# Lesson 23: recommendation systems demonstration

This notebook demonstrates key concepts for recommendation systems:

1. Collaborative filtering
    - Memory based
    - Model based

2. Content-based filtering
3. Hybrid filtering

## Notebook set up

### Imports

In [None]:
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
from sklearn.metrics.pairwise import cosine_similarity  # To compute similarity scores
from sklearn.decomposition import TruncatedSVD  # For matrix factorization

### Dataset

Load the anime catalogue and the user ratings data.

In [None]:
# Read the anime catalogue into a DataFrame and preview its first rows
anime_csv_path = 'anime.csv'
animes = pd.read_csv(anime_csv_path)
animes.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [38]:
# Summarize column dtypes and non-null counts for the anime catalogue
animes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [None]:
# Read the per-user ratings into a DataFrame and preview its first rows.
# NOTE(review): the preview shows rating == -1; presumably a "watched but not
# rated" sentinel rather than a real score — confirm against the dataset docs,
# because the later cells feed these values in as ratings without filtering.
ratings_csv_path = 'rating.csv'
ratings = pd.read_csv(ratings_csv_path)
ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


Check the size and structure of the ratings dataset to understand the data volume.

In [40]:
# Summarize row count, column dtypes and memory footprint of the ratings table
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [None]:
# Work on a reproducible random subset of the ratings to keep the demo fast
n_sampled_rows = 50000
sampling_seed = 315
sample_ratings = ratings.sample(n=n_sampled_rows, random_state=sampling_seed)
sample_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50000 entries, 5012406 to 6767779
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   user_id   50000 non-null  int64
 1   anime_id  50000 non-null  int64
 2   rating    50000 non-null  int64
dtypes: int64(3)
memory usage: 1.5 MB


In [None]:
# How many distinct users and animes are present in the sample?
num_users, num_animes = (
    sample_ratings[id_column].nunique() for id_column in ('user_id', 'anime_id')
)

print(f"Number of unique users in sample: {num_users}")
print(f"Number of unique animes in sample: {num_animes}")

Number of unique users in sample: 26940
Number of unique animes in sample: 4863


## 1. Collaborative filtering

### 1.1. Memory based collaborative filtering

Create a user-item matrix where each row is a user, each column is an anime, and values are ratings.

In [None]:
# Reshape the long ratings table into a wide grid: one row per user, one
# column per anime. pivot_table aggregates duplicate (user, anime) pairs with
# the mean by default; pairs absent from the sample are left as NaN.
# NOTE(review): rating values of -1 are pivoted as-is; presumably they are a
# "not rated" sentinel — confirm, since they end up below the 0 fill value.
user_item_matrix = pd.pivot_table(
    sample_ratings,
    values='rating',
    index='user_id',
    columns='anime_id',
)

# Replace NaN (no rating in the sample) with 0 so the matrix is fully numeric
user_item_filled = user_item_matrix.fillna(value=0)

print('User-Item Matrix shape:', user_item_filled.shape)
user_item_filled.head()

User-Item Matrix shape: (26940, 4863)


anime_id,1,5,6,7,8,15,16,17,18,19,...,33524,33558,33569,33606,33740,33741,33798,33964,34103,34240
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Compute cosine similarity between all pairs of animes to find which animes have similar rating patterns across users.

In [48]:
# Preview the transposed matrix (animes as rows, users as columns)
user_item_filled.T

user_id,3,4,5,7,11,13,14,17,21,26,...,73490,73491,73495,73499,73500,73501,73503,73507,73510,73515
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# cosine_similarity compares rows, so pass the transposed matrix
# (one row per anime) to obtain anime-to-anime similarities
item_labels = user_item_filled.columns
item_similarity = cosine_similarity(user_item_filled.T)

# Label both axes with anime IDs so scores can be looked up by ID
item_similarity_df = pd.DataFrame(
    item_similarity, index=item_labels, columns=item_labels
)

print('Item similarity matrix shape:', item_similarity_df.shape)
item_similarity_df.head()

Item similarity matrix shape: (4863, 4863)


anime_id,1,5,6,7,8,15,16,17,18,19,...,33524,33558,33569,33606,33740,33741,33798,33964,34103,34240
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.026386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.026386,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Create helper functions to convert between anime IDs and names for more readable output.

In [None]:
def get_anime_name(anime_id):
    """Look up an anime's name in the global ``animes`` table by its ID.

    Returns the name of the first matching row, or the placeholder string
    ``'Unknown (ID: <anime_id>)'`` when no row has that ID.
    """
    matches = animes.loc[animes['anime_id'] == anime_id, 'name']
    if matches.empty:
        return f'Unknown (ID: {anime_id})'
    return matches.iloc[0]

def get_anime_id(anime_name):
    """Look up an anime's ID in the global ``animes`` table by its exact name.

    Returns the ID of the first matching row, or ``None`` when no row has
    that name.
    """
    matches = animes.loc[animes['name'] == anime_name, 'anime_id']
    if matches.empty:
        return None
    return matches.iloc[0]

# Sanity-check the ID -> name helper on a single known ID
print(f"Anime ID 1: {get_anime_name(1)}")

Anime ID 1: Cowboy Bebop


Demonstrate memory-based collaborative filtering by finding the top 5 animes most similar to a target anime based on user rating patterns.

In [None]:
# Select target anime
anime_id = 1

# Rank every anime by cosine similarity to the target, highest first
similar_animes = item_similarity_df[anime_id].sort_values(ascending=False)

print(f'Top 5 animes similar to "{get_anime_name(anime_id)}":')
print()

# Remove the target by label rather than skipping position 0: any other anime
# with similarity 1.0 ties with the target, so the target is not guaranteed
# to sort first and a positional slice could drop the wrong row.
top_similar = similar_animes.drop(anime_id).head(5)

for anime_id_similar, score in top_similar.items():
    print(f'{get_anime_name(anime_id_similar)}: {score:.4f}')

Top 5 animes similar to "Cowboy Bebop":

Koi☆Sento: 0.1218
Hikyou Tanken Fam &amp; Ihrlie: 0.1218
Houkago 2: Saiyuri: 0.1218
Lupin III: Ikiteita Majutsushi: 0.1206
Gyakuten Majo Saiban: Chijo na Majo ni Sabakarechau The Animation: 0.1200


### 1.2. Model based collaborative filtering

Use matrix factorization (SVD) to reduce dimensionality and fill in missing ratings by learning latent features of users and animes.

In [None]:
# Factorize the zero-filled user-item matrix into 50 latent dimensions
n_latent_features = 50
svd_model = TruncatedSVD(n_components=n_latent_features, random_state=315)

# fit_transform learns the factorization and returns one latent row per user
user_features = svd_model.fit_transform(user_item_filled)

# Project back to rating space: (users x factors) . (factors x animes)
predicted_ratings = user_features.dot(svd_model.components_)

# Re-attach the user and anime labels of the input matrix
predicted_ratings_df = pd.DataFrame(
    predicted_ratings,
    index=user_item_filled.index,
    columns=user_item_filled.columns,
)

print('Predicted ratings matrix shape:', predicted_ratings_df.shape)
predicted_ratings_df.head()

Predicted ratings matrix shape: (26940, 4863)


anime_id,1,5,6,7,8,15,16,17,18,19,...,33524,33558,33569,33606,33740,33741,33798,33964,34103,34240
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,-0.123558,0.001347,0.015531,-0.00136,1.970482e-05,-0.000653,0.006206,0.0006840352,-6.027962e-05,-0.007176,...,-0.006508,0.000537,-0.001496,0.000805,-0.0001090123,-5.79641e-05,0.000618,6.392568e-05,4.7e-05,0.008004
4,-0.000143,-3e-05,0.000164,-1e-05,-2.217778e-07,-1.4e-05,-1.4e-05,-8.815549e-07,-5.654995e-07,-1.6e-05,...,-1e-05,2e-06,-2e-06,-1e-05,-3.455666e-07,2.782329e-08,-1e-06,-2.231423e-08,-3e-06,1e-05
5,-0.002929,-0.000565,-0.000439,8e-06,-9.950065e-07,0.000218,0.000513,3.954197e-05,2.285818e-05,-0.000223,...,-0.000115,0.000107,1.4e-05,6.6e-05,-2.426508e-06,-6.658181e-07,4.7e-05,9.848932e-06,-8.3e-05,0.000776
7,-0.005738,-0.000151,-0.000507,5e-06,9.891335e-07,8.7e-05,5.6e-05,8.833031e-06,5.095776e-06,0.000141,...,-1.7e-05,1e-05,1e-05,-1.9e-05,-1.102061e-06,-1.01135e-07,4e-06,3.40646e-07,5e-06,-2.9e-05
11,0.292969,0.04347,0.075774,-0.001745,-4.406893e-05,0.001446,-0.012093,-0.0006346332,0.001169022,0.003752,...,-0.006507,0.004725,-0.005896,0.00205,-1.419158e-05,-9.158018e-05,-0.001123,0.0006495072,0.000187,-0.022802


Demonstrate model-based collaborative filtering by recommending unwatched animes to a user based on predicted ratings from SVD.

In [None]:
# Use the first user in the matrix as the demo user
user_id = user_item_matrix.index[0]

# Reconstructed scores for every anime, for this user
user_predictions = predicted_ratings_df.loc[user_id]

# NaN in the unfilled matrix marks animes with no rating from this user
user_row = user_item_matrix.loc[user_id]
unrated_animes = user_row[user_row.isna()]

# Keep only the predictions for unrated animes, best first
recommendations = user_predictions.loc[unrated_animes.index].sort_values(
    ascending=False
)

print(f'Top 5 recommended animes for user {user_id}:')
print(recommendations.head())

Top 5 recommended animes for user 3:
anime_id
18679    0.843389
2236     0.649259
223      0.471859
9989     0.449235
6880     0.438066
Name: 3, dtype: float64


## 2. Content-based filtering

Examine the content features (genre, type) available for each anime to use in content-based filtering.

In [None]:
# Preview the content columns (genre, type) alongside each anime's ID and name
content_columns = ['anime_id', 'name', 'genre', 'type']
animes.loc[:, content_columns].head(10)

Unnamed: 0,anime_id,name,genre,type
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV
6,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV
7,820,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"Action, Comedy, Historical, Parody, Samurai, S...",Movie
9,15417,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV


Create a function to calculate similarity between animes based on their genres using Jaccard similarity (intersection over union).

In [49]:
# Convert each comma-separated genre string into a set of genre names.
# Empty tokens are discarded so that a missing genre becomes an empty set:
# ''.split(', ') yields [''], which would otherwise produce {''} and make two
# genre-less animes look like a perfect (1.0) Jaccard match.
animes['genre_set'] = animes['genre'].fillna('').apply(
    lambda genres: {token.strip() for token in genres.split(',') if token.strip()}
)

def genre_similarity(genres1, genres2):
    """Return the Jaccard index of two genre sets.

    The index is the number of shared genres divided by the number of
    distinct genres across both sets. Returns 0 when either set is empty.
    """
    # No overlap can exist when at least one side has no genres
    if not (len(genres1) and len(genres2)):
        return 0

    n_shared = len(genres1.intersection(genres2))
    n_distinct = len(genres1.union(genres2))

    # Guard the division, mirroring the empty-set case above
    if n_distinct <= 0:
        return 0
    return n_shared / n_distinct

Select a target anime to demonstrate content-based filtering using genre similarity.

In [50]:
# Pick the anime whose genres the other titles will be compared against
target_anime_id = 1
is_target = animes['anime_id'] == target_anime_id
target_anime = animes.loc[is_target].iloc[0]
target_genres = target_anime.loc['genre_set']

print(f"Target anime: {target_anime['name']}")
print(f"Genres: {target_anime['genre']}")

Target anime: Cowboy Bebop
Genres: Action, Adventure, Comedy, Drama, Sci-Fi, Space


Demonstrate content-based filtering by finding animes with the most similar genres to the target anime.

In [51]:
# Score every anime by genre overlap with the target and store it on the frame
animes['similarity'] = animes['genre_set'].map(
    lambda genre_set: genre_similarity(target_genres, genre_set)
)

# Rank all animes except the target itself, highest genre overlap first
not_target = animes['anime_id'] != target_anime_id
ranked_by_genre = animes.loc[not_target].sort_values(
    by='similarity', ascending=False
)
similar_animes = ranked_by_genre[['name', 'genre', 'similarity']].head(5)

print('Top 5 similar animes based on genre:')
similar_animes

Top 5 similar animes based on genre:


Unnamed: 0,name,genre,similarity
1465,Cowboy Bebop: Yose Atsume Blues,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",1.0
6568,Seihou Tenshi Angel Links,"Action, Adventure, Comedy, Drama, Romance, Sci...",0.857143
2735,Uchuu Kaizoku Captain Harlock: Arcadia-gou no ...,"Action, Adventure, Drama, Sci-Fi, Space",0.833333
5721,Kaitei Choutokkyuu: Marine Express,"Action, Adventure, Comedy, Drama, Sci-Fi",0.833333
1073,Waga Seishun no Arcadia,"Action, Adventure, Drama, Sci-Fi, Space",0.833333


## 3. Hybrid filtering

Combine collaborative filtering and content-based filtering using a weighted average to leverage both user behavior and content features.

In [54]:
# Collaborative scores: rating-pattern similarity of every anime to the target
collab_score = item_similarity_df[target_anime_id]

# Content scores: genre similarity stored in animes['similarity'].
# This column must have been computed for the same target_anime_id.
content_score = animes.set_index('anime_id')['similarity']

# Only animes scored by both methods can be combined
common_animes = collab_score.index.intersection(content_score.index)

# Combine scores with equal weights (50% each)
hybrid_score = (
    0.5 * collab_score.loc[common_animes] +
    0.5 * content_score.loc[common_animes]
)

# Sort, then remove the target by label rather than by position: the target
# is not guaranteed to rank first (ties are possible), so a positional
# [1:6] slice could drop a real recommendation and keep the target.
hybrid_recommendations = (
    hybrid_score
    .sort_values(ascending=False)
    .drop(target_anime_id, errors='ignore')
    .head(5)
)
print(f'Top 5 hybrid recommendations for anime_id {target_anime_id}:')

for anime_id_rec, score in hybrid_recommendations.items():
    print(f'{get_anime_name(anime_id_rec)}: {score:.4f}')

Top 5 hybrid recommendations for anime_id 1:
Cowboy Bebop: Yose Atsume Blues: 0.5000
Seihou Tenshi Angel Links: 0.4294
Seihou Bukyou Outlaw Star: 0.4167
Ginga Tetsudou Monogatari: 0.4167
Waga Seishun no Arcadia: Mugen Kidou SSX: 0.4167
