<a href="https://colab.research.google.com/github/hira-14/movie_recommender/blob/main/03_content_based_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import scipy.sparse as sp
import pickle

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the files under DATA_PATH are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory on the mounted Drive that this notebook reads its inputs from and writes its outputs to.
DATA_PATH = '/content/drive/MyDrive/ml-1m/ml-1m'

In [None]:
# Load the ratings table (presumably written by an earlier feature-engineering notebook — confirm).
# NOTE: read_pickle can execute arbitrary code from the file; only load pickles you produced yourself.
df = pd.read_pickle(DATA_PATH + '/fe_data.pkl')

In [None]:
# Preview the first rows to confirm the columns used below (movie_id, title, rating, genre flags) exist.
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zipcode,title,genres,...,Sci-Fi,Thriller,War,Western,release_year,user_total_ratings,movie_total_ratings,prev_ts,recency_days,session_within_7d
31,1,3186,4,2000-12-31 22:00:19,F,1,10,48067,"Girl, Interrupted (1999)",Drama,...,0,0,0,0,1999,53,431,NaT,0,1
22,1,1270,5,2000-12-31 22:00:55,F,1,10,48067,Back to the Future (1985),Comedy|Sci-Fi,...,1,0,0,0,1985,53,2583,2000-12-31 22:00:19,0,1
27,1,1721,4,2000-12-31 22:00:55,F,1,10,48067,Titanic (1997),Drama|Romance,...,0,0,0,0,1997,53,1546,2000-12-31 22:00:55,0,1
37,1,1022,5,2000-12-31 22:00:55,F,1,10,48067,Cinderella (1950),Animation|Children's|Musical,...,0,0,0,0,1950,53,577,2000-12-31 22:00:55,0,1
24,1,2340,3,2000-12-31 22:01:43,F,1,10,48067,Meet Joe Black (1998),Romance,...,0,0,0,0,1998,53,344,2000-12-31 22:00:55,0,1


In [None]:
# Load the title vectorizer (presumably a fitted TfidfVectorizer; it is only used via .transform below).
# NOTE(security): pickle.load executes arbitrary code from the file — only load artifacts you created.
with open(DATA_PATH + '/tfidf_title.pkl', 'rb') as f:
    tfidf = pickle.load(f)

# Build the path from DATA_PATH instead of repeating the directory, so both artifacts
# always come from the same location if DATA_PATH is changed.
tfidf_matrix = sp.load_npz(DATA_PATH + '/tfidf_title_matrix.npz')

print("Data loaded:")
print(f"- Movies: {df['movie_id'].nunique()}")
print(f"- TF-IDF matrix shape: {tfidf_matrix.shape}")

Data loaded:
- Movies: 3706
- TF-IDF matrix shape: (1000209, 2000)


In [None]:
# Build one content profile per movie: one-hot genre flags + TF-IDF of the title.

# Genre flag columns actually present in df, kept in df's column order
genre_cols = [col for col in df.columns if col in [
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]]

# De-duplicate the ratings table ONCE (first row seen for each movie_id) and take the
# title and all genre flags from that single frame, instead of re-running
# drop_duplicates on the full table for every genre.
movie_profiles = (
    df.drop_duplicates('movie_id')
      .set_index('movie_id')[['title'] + genre_cols]
      .copy()
)
# Row position in movie_profiles is used later as the row/column position in the
# similarity matrix, so movie ids must be unique.
assert movie_profiles.index.is_unique, "movie_id must be unique in movie_profiles"

# Get TF-IDF vectors for unique movies (one row per movie, same order as movie_profiles)
unique_titles = movie_profiles['title'].tolist()
tfidf_matrix_unique = tfidf.transform(unique_titles)

# Combine features: genre flags first, then the title TF-IDF columns
genre_features = movie_profiles[genre_cols].values.astype(float)
combined_features = sp.hstack([genre_features, tfidf_matrix_unique], format='csr')
assert combined_features.shape[0] == len(movie_profiles), "feature rows must align with movie_profiles"

# Normalize features (sklearn's default: each row scaled to unit L2 norm)
normalized_features = normalize(combined_features)

print("\nMovie profile matrix shape:", normalized_features.shape)
print("Number of unique movies:", len(movie_profiles))
print("First movie:", movie_profiles.iloc[0]['title'])
print("Genres:", movie_profiles.iloc[0][genre_cols].to_dict())
print("Sample normalized features (first 10):", normalized_features[0].toarray()[0, :10])


Movie profile matrix shape: (3706, 2018)
Number of unique movies: 3706
First movie: Girl, Interrupted (1999)
Genres: {'Action': 0, 'Adventure': 0, 'Animation': 0, "Children's": 0, 'Comedy': 0, 'Crime': 0, 'Documentary': 0, 'Drama': 1, 'Fantasy': 0, 'Film-Noir': 0, 'Horror': 0, 'Musical': 0, 'Mystery': 0, 'Romance': 0, 'Sci-Fi': 0, 'Thriller': 0, 'War': 0, 'Western': 0}
Sample normalized features (first 10): [0.         0.         0.         0.         0.         0.
 0.         0.70710678 0.         0.        ]


In [None]:
# Pairwise cosine similarity between all movie profile rows (dense movies x movies array)
content_sim_matrix = cosine_similarity(normalized_features)

# Keep a CSR copy on Drive; exact zeros are dropped by the sparse format
sparse_sim_matrix = sp.csr_matrix(content_sim_matrix)
sp.save_npz(DATA_PATH + '/content_sim_matrix.npz', sparse_sim_matrix)

# Share of entries that are exactly zero, formatted as a percentage
sparsity_label = f"{100 * (1 - sparse_sim_matrix.nnz / np.prod(sparse_sim_matrix.shape)):.2f}%"

print("\nSimilarity matrix shape:", content_sim_matrix.shape)
print("Sparsity:", sparsity_label)
print("Sample similarity scores for first movie:")
# Similarity of the first movie to the first five movies (entry 0 is the movie itself)
print(content_sim_matrix[0, :5])


Similarity matrix shape: (3706, 3706)
Sparsity: 64.50%
Sample similarity scores for first movie:
[1.         0.         0.40824829 0.         0.        ]


In [None]:
def get_content_recommendations(user_id, n=10, rating_threshold=4.0):
    """Generate content-based recommendations for a user.

    Scores every movie by its mean cosine similarity to the movies the user
    rated at or above ``rating_threshold``, then returns the best-scoring
    movies the user has not rated.

    Reads the notebook globals ``df`` (ratings), ``movie_profiles`` (indexed by
    movie_id) and ``content_sim_matrix`` (rows/columns in ``movie_profiles`` order).

    Args:
        user_id: Value matched against ``df['user_id']``.
        n: Maximum number of recommendations; values <= 0 yield an empty result.
        rating_threshold: Minimum rating for a movie to count as "liked".

    Returns:
        pd.Series of titles indexed by movie_id, best match first. Empty when
        the user has no liked movies. May hold fewer than ``n`` entries if the
        user has fewer than ``n`` unrated movies.
    """
    # Get user's highly rated movies
    user_ratings = df[df['user_id'] == user_id]
    liked_movies = user_ratings[user_ratings['rating'] >= rating_threshold]

    if liked_movies.empty:
        # Explicit dtype keeps the empty result consistent with the non-empty one
        return pd.Series([], name='title', dtype=object)

    # Row positions of the liked movies; get_indexer returns -1 for unknown ids,
    # which must be dropped so they do not silently select the last row.
    liked_indices = movie_profiles.index.get_indexer(liked_movies['movie_id'])
    liked_indices = liked_indices[liked_indices >= 0]
    if liked_indices.size == 0:
        return pd.Series([], name='title', dtype=object)

    # Aggregate similarity scores (mean over the liked movies)
    sim_scores = content_sim_matrix[liked_indices].mean(axis=0)

    # Exclude already rated movies by scoring them -inf. Multiplying by the mask
    # would give them 0, tying them with unrated movies of zero similarity and
    # letting already-rated titles leak into the top-N.
    candidate_mask = ~movie_profiles.index.isin(user_ratings['movie_id'])
    candidate_scores = np.where(candidate_mask, sim_scores, -np.inf)

    # Never return more rows than there are unrated candidates
    top_k = max(0, min(n, int(candidate_mask.sum())))
    top_indices = np.argsort(candidate_scores)[::-1][:top_k]
    recommendations = movie_profiles.iloc[top_indices]['title']

    return recommendations

# Smoke-test the recommender on a single user. `sample_user` and `recs` are reused by the
# cell below; `recs` is a Series of titles indexed by movie_id.
sample_user = 1
recs = get_content_recommendations(sample_user, n=10)
print(f"\nTop 10 content-based recommendations for user {sample_user}:")
print(recs)


Top 10 content-based recommendations for user 1:
movie_id
1812             Wide Awake (1998)
34                     Babe (1995)
1014              Pollyanna (1960)
2138         Watership Down (1978)
2059       Parent Trap, The (1998)
241                   Fluke (1995)
2218       Juno and Paycock (1930)
957     Scarlet Letter, The (1926)
2777                  Cobra (1925)
2226              Ring, The (1927)
Name: title, dtype: object


In [None]:
# Write the sample user's ranked recommendations to a CSV on Drive (rank 1 = best match)
recs_df = pd.DataFrame(
    {
        'user_id': sample_user,
        'movie_id': recs.index,
        'title': recs.to_numpy(),
        'rank': range(1, len(recs) + 1),
    }
)
recs_df.to_csv(DATA_PATH + '/content_recs_sample.csv', index=False)

# Eyeball a few of the user's liked titles next to the top recommendations
print("\nSpot-checking recommendations:")
user_liked = df.loc[(df['user_id'] == sample_user) & (df['rating'] >= 4), 'title'].head(3)
print(f"User {sample_user}'s liked movies:")
print(user_liked.to_numpy())

print("\nTop recommendations:")
print(recs.iloc[:5].to_numpy())

# Check similarity for a blockbuster movie
def check_similar_movies(movie_title, n=5):
    """Return the ``n`` movies most similar in content to ``movie_title``.

    Reads the notebook globals ``movie_profiles`` and ``content_sim_matrix``.

    Args:
        movie_title: Exact title as stored in ``movie_profiles['title']``
            (e.g. 'Toy Story (1995)').
        n: Maximum number of similar movies; values <= 0 yield an empty Series.

    Returns:
        pd.Series of titles indexed by movie_id, most similar first, or the
        string "Movie not found" when no movie has exactly that title. Any
        other error is raised instead of being reported as a missing movie.
    """
    title_hits = np.flatnonzero((movie_profiles['title'] == movie_title).to_numpy())
    if title_hits.size == 0:
        return "Movie not found"
    idx = int(title_hits[0])  # first match, as a row position in the similarity matrix

    scores = np.asarray(content_sim_matrix[idx])
    # Stable descending sort: ties keep ascending row order
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query movie by position rather than assuming it sorts first;
    # a movie with an identical profile ties at 1.0 and may precede it.
    ranked = ranked[ranked != idx][:max(n, 0)]
    return movie_profiles.iloc[ranked]['title']

print("\nMost similar to 'Toy Story (1995)':")
print(check_similar_movies('Toy Story (1995)'))

# Titles in this dataset carry a leading article at the end (cf. 'Parent Trap, The (1998)'),
# so the lookup must use 'Matrix, The (1999)'; 'The Matrix (1999)' never matches.
print("\nMost similar to 'Matrix, The (1999)':")
print(check_similar_movies('Matrix, The (1999)'))


Spot-checking recommendations:
User 1's liked movies:
['Girl, Interrupted (1999)' 'Back to the Future (1985)' 'Titanic (1997)']

Top recommendations:
['Wide Awake (1998)' 'Babe (1995)' 'Pollyanna (1960)'
 'Watership Down (1978)' 'Parent Trap, The (1998)']

Most similar to 'Toy Story (1995)':
movie_id
3114                        Toy Story 2 (1999)
2355                      Bug's Life, A (1998)
3751                        Chicken Run (2000)
1064    Aladdin and the King of Thieves (1996)
2141                  American Tail, An (1986)
Name: title, dtype: object

Most similar to 'The Matrix (1999)':
Movie not found


# Content-Based Movie Recommendation System: Methodology & Analysis

## **1. Methodology Overview**

Content-based filtering (CBF) recommends movies to users by analyzing the **attributes** (content) of movies a user has liked in the past, rather than relying on the preferences of other users.

### **Pipeline Steps:**

#### **A. Feature Construction**
- Each movie is described by a **feature vector** combining:
  - **Genres:** (e.g., Action, Comedy, Drama, etc.), one-hot encoded.
  - **Title (TF-IDF):** A vectorized representation (using TF-IDF) capturing keywords and themes from the movie title.
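- All features are concatenated, and each movie's vector is then **L2-normalized** (scaled to unit length) so that movies with many genres or long titles remain comparable to movies with few.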

#### **B. Movie Profile Matrix**
- Construct a matrix with each movie as a row and its features (genres + TF-IDF) as columns.
- Shape: `(number_of_movies, number_of_features)`.

#### **C. Cosine Similarity Matrix**
- Compute pairwise cosine similarity between all movie vectors.
- Resulting matrix: `(number_of_movies, number_of_movies)`, where each entry indicates how similar two movies are in terms of their content features.

#### **D. Generating Recommendations**
- For a given user:
  - Identify all movies the user rated highly (e.g., `rating >= 4`).
  - For each liked movie, find similar movies using the cosine similarity matrix.
  - Average the similarity scores across all liked movies to compute a single "similarity score" for each candidate movie.
  - Recommend the top-N movies with the highest scores that the user hasn't already rated.

---

## **2. Code Intuition**

- **TF-IDF Vectorization** captures the importance of words in movie titles, highlighting unique themes or sequels.
- **Genre Encoding** gives structure to user preferences (e.g., always liking Animation or Action).
- **Cosine Similarity** is chosen for its ability to compare sparse, high-dimensional vectors.
- **Exclusion of Already Seen Movies** ensures recommendations are novel to the user.

---

## **3. Sample Results Interpretation**

### **Sample Outputs**

- **Number of unique movies:** `3706`
- **Profile matrix shape:** `(3706, 2018)`
- **Similarity matrix shape:** `(3706, 3706)` (each cell shows how similar two movies are).
- **Sparsity:** `64.5%` (about two-thirds of all movie pairs have a similarity of exactly zero, i.e. they share no genre and no title term).
- **Sample recommendation for User 1:**
  - *Wide Awake (1998)*
  - *Babe (1995)*
  - *Pollyanna (1960)*
  - *Watership Down (1978)*
  - *Parent Trap, The (1998)*
- **Spot-check**: The recommendations often match the *style, genre, or mood* of previously liked movies.
- **Similarity check**: "Toy Story (1995)" returns family-friendly animated movies and sequels (e.g., "Toy Story 2", "Bug's Life", "Chicken Run").

### **Key Insights**

- **Strengths:**
  - Works even for new users as long as they have rated a few movies.
  - Produces explainable recommendations ("We recommended X because it’s similar to Y you liked").
  - Can discover "hidden gems" with similar content but less popularity.

- **Weaknesses:**
  - May miss out on *collaborative effects* (e.g., popularity, trends, or social signals).
  - Can over-specialize: if a user only rates a narrow type of movie, recommendations may lack diversity.
  - Highly dependent on the quality and granularity of content features (genres, keywords).

- **Typical Applications:**
  - Used as a building block for hybrid systems (combined with collaborative filtering and session-based signals).
  - Useful in domains with rich metadata (movies, news, products).

---

## **4. Conclusion**

The Content-Based Filtering method in this pipeline robustly recommends movies that match a user's explicit preferences based on movie content.  
However, its scope is limited by the richness of content features and may benefit from combination with collaborative or session-based methods for improved diversity and serendipity.

