Explicit Collaborative Filtering
scenario:
We have many customers and many items (movies).
Each customer has rated some of the movies (explicit feedback).
Similar customers will like similar movies (here "similar" refers only to how customers rate movies, not to any other customer feature).

In [3]:
import numpy as np
import pandas as pd

In [25]:
# 1. CREATE A TINY FAKE DATASET
# 5 customers, 20 movies; every customer rates a random subset of the movies.
SEED = 42                         # fixed seed so the fake data is reproducible
MOVIES_PER_CUSTOMER = 6           # number of distinct movies each customer rates
RATING_MIN, RATING_MAX = 1, 19    # inclusive rating scale used by the fake data

customers = [f"C{i}" for i in range(1, 6)]      # C1..C5
movies = [f"M{i}" for i in range(1, 21)]        # M1..M20

rng = np.random.RandomState(SEED)

rows = []

for c in customers:
    # replace=False guarantees a customer never rates the same movie twice
    rated_movies = rng.choice(movies, size=MOVIES_PER_CUSTOMER, replace=False)
    for m in rated_movies:
        # randint's upper bound is exclusive, hence the +1 to include RATING_MAX
        rating = rng.randint(RATING_MIN, RATING_MAX + 1)
        rows.append([c, m, rating])

ratings_df = pd.DataFrame(rows, columns=["customer_id", "movie_id", "rating"])

# Sanity check: every generated rating lies on the declared scale.
assert ratings_df["rating"].between(RATING_MIN, RATING_MAX).all()

print("Sample ratings:")
print(ratings_df.head(4))

Sample ratings:
  customer_id movie_id  rating
0          C1       M1       2
1          C1      M18       1
2          C1      M16      12
3          C1       M2      12


In [26]:
# 2. BUILD USER-ITEM MATRIX
# One row per customer, one column per movie that received at least one rating.
# Cells hold the rating; customer/movie pairs without a rating come out as NaN.
user_item = pd.pivot_table(
    ratings_df,
    values="rating",
    index="customer_id",
    columns="movie_id",
)
print("\nUser-Item matrix:", user_item, sep="\n")


User-Item matrix:
movie_id      M1  M11   M12  M13  M14   M16  M17   M18   M19    M2   M20  \
customer_id                                                                
C1           2.0  NaN   NaN  NaN  NaN  12.0  NaN   1.0   NaN  12.0   NaN   
C2           NaN  2.0   NaN  NaN  9.0   NaN  NaN  12.0   NaN   8.0   NaN   
C3           4.0  NaN  14.0  NaN  NaN   NaN  NaN   8.0  14.0   NaN   NaN   
C4           NaN  NaN   NaN  NaN  NaN   NaN  1.0   NaN   3.0  19.0   NaN   
C5           NaN  3.0   NaN  3.0  NaN   NaN  NaN   NaN   1.0   NaN  14.0   

movie_id       M3   M4    M6    M7   M8    M9  
customer_id                                    
C1            NaN  NaN  10.0   NaN  NaN  17.0  
C2            NaN  NaN  15.0   NaN  7.0   NaN  
C3            NaN  NaN   NaN  16.0  NaN  15.0  
C4            8.0  3.0  17.0   NaN  NaN   NaN  
C5           15.0  5.0   NaN   NaN  NaN   NaN  


In [27]:
# 3. SIMPLE USER-BASED COLLABORATIVE FILTERING
# We will:
#  - compute similarity between users (cosine similarity)
#  - predict rating for a customer-item as a weighted average of ratings from similar users

def cosine_similarity(u, v, min_overlap=1):
    """Cosine similarity of two 1D rating vectors over their co-rated positions.

    Positions where either vector is NaN are ignored, so the similarity is
    computed only on items both users have rated.

    Parameters
    ----------
    u, v : array-like of float
        Rating vectors of equal length; NaN marks "not rated". Lists, pandas
        Series and numpy arrays are accepted.
    min_overlap : int, default 1
        Minimum number of co-rated positions required. With fewer, 0.0 is
        returned. With a single co-rated item the cosine of two non-zero
        ratings of the same sign is always 1.0, so raising this threshold
        lets callers discount such degenerate matches.

    Returns
    -------
    float
        The cosine similarity, or 0.0 when there are too few co-rated
        positions or when either masked vector has zero norm.

    Raises
    ------
    ValueError
        If ``u`` and ``v`` do not have the same shape.
    """
    # Coerce up front so boolean-mask indexing below works for any array-like.
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    if u.shape != v.shape:
        raise ValueError(
            f"u and v must have the same shape, got {u.shape} and {v.shape}"
        )

    # only keep positions where both have non-NaN values
    both_rated = ~(np.isnan(u) | np.isnan(v))
    if both_rated.sum() < max(int(min_overlap), 1):
        return 0.0

    u_common = u[both_rated]
    v_common = v[both_rated]
    den = np.linalg.norm(u_common) * np.linalg.norm(v_common)
    if den == 0:
        # at least one vector is all zeros on the overlap: similarity undefined
        return 0.0
    return float(np.dot(u_common, v_common) / den)

 

def compute_user_similarities(user_item_matrix):
    """Return a DataFrame of user-user similarities.

    Parameters
    ----------
    user_item_matrix : pandas.DataFrame
        One row per user, one column per item; NaN (or a missing value)
        marks "not rated".

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by the users of
        ``user_item_matrix``. The diagonal is fixed at 1.0; every other cell
        is ``cosine_similarity`` of the two users' rating rows.
    """
    users = user_item_matrix.index.tolist()
    # Force a float array so missing values are real NaNs that np.isnan can
    # handle, even if the frame uses object or nullable dtypes.
    ratings = user_item_matrix.to_numpy(dtype=float, na_value=np.nan)
    n_users = len(users)

    # Start from the identity: each user is fully similar to themselves.
    sim_matrix = np.eye(n_users)
    for i in range(n_users):
        # The measure is symmetric, so compute each pair once and mirror it.
        for j in range(i + 1, n_users):
            sim = cosine_similarity(ratings[i], ratings[j])
            sim_matrix[i, j] = sim
            sim_matrix[j, i] = sim

    return pd.DataFrame(sim_matrix, index=users, columns=users)

# Pairwise similarity between all customers, shown rounded to two decimals.
user_sim = compute_user_similarities(user_item)
print("\nUser-User similarity matrix:", user_sim.round(2), sep="\n")


User-User similarity matrix:
      C1    C2   C3    C4    C5
C1  1.00  0.79  0.9  1.00  0.00
C2  0.79  1.00  1.0  0.94  1.00
C3  0.90  1.00  1.0  1.00  1.00
C4  1.00  0.94  1.0  1.00  0.96
C5  0.00  1.00  1.0  0.96  1.00


In [28]:

def predict_rating(target_user, target_movie, user_item_matrix, user_sim_matrix):
    """
    Predict rating of target_user for target_movie using
    weighted average of ratings from other similar users.

    Parameters
    ----------
    target_user : label
        Row label of the user in ``user_sim_matrix``.
    target_movie : label
        Column label of the movie in ``user_item_matrix``.
    user_item_matrix : pandas.DataFrame
        Users x movies ratings, NaN where a user has not rated a movie.
    user_sim_matrix : pandas.DataFrame
        Users x users similarity scores.

    Returns
    -------
    float
        Similarity-weighted mean of the neighbours' ratings, or NaN when the
        movie or user is unknown, or when no other user with a positive
        similarity has rated the movie.
    """
    if target_movie not in user_item_matrix.columns:
        return np.nan
    if target_user not in user_sim_matrix.index:
        return np.nan

    # ratings of the users who actually rated this movie
    neighbor_ratings = user_item_matrix[target_movie].dropna()
    # The target user's own rating must not vote for itself: with a
    # self-similarity of 1.0 it would dominate the "prediction".
    neighbor_ratings = neighbor_ratings.drop(target_user, errors="ignore")
    # Skip raters for which no similarity is available instead of raising.
    known = neighbor_ratings.index.isin(user_sim_matrix.columns)
    neighbor_ratings = neighbor_ratings[known]
    if neighbor_ratings.empty:
        return np.nan

    # similarities from target_user to these users
    sims = user_sim_matrix.loc[target_user, neighbor_ratings.index].to_numpy(dtype=float)
    ratings = neighbor_ratings.to_numpy(dtype=float)

    # keep only positive similarities (NaN compares False and is dropped too)
    positive = sims > 0
    if not positive.any():
        return np.nan

    # weighted average
    weights = sims[positive]
    return float(np.dot(weights, ratings[positive]) / weights.sum())

In [29]:
# 4. GENERATE SIMPLE RECOMMENDATIONS FOR ONE CUSTOMER
target_user = "C1"

# Split the catalogue: movies the target customer has rated vs. the rest.
rated_movies_by_C1 = user_item.loc[target_user].dropna().index.tolist()
unrated_movies = [m for m in movies if m not in rated_movies_by_C1]

# Score each unrated movie; a NaN score means no usable prediction, so skip it.
predictions = []
for m in unrated_movies:
    pred = predict_rating(target_user, m, user_item, user_sim)
    if not np.isnan(pred):
        predictions.append((m, pred))

# Best predicted rating first.
predictions = sorted(predictions, key=lambda pair: pair[1], reverse=True)

print(f"\nMovies recommended for {target_user}:")

# Show the top five candidates.
for movie_id, score in predictions[:5]:
    print(f"  {movie_id}: predicted rating {score:.2f}")


Movies recommended for C1:
  M7: predicted rating 16.00
  M12: predicted rating 14.00
  M14: predicted rating 9.00
  M19: predicted rating 8.23
  M3: predicted rating 8.00
