In [1]:
import pandas as pd

interaction_df = pd.read_csv("C:/Users/minge/Downloads/csv file/interactions_data_file.csv")

print(interaction_df.head())

                            user_id   book_id  \
0  8842281e1d1347389f2ab93d60773d4d  10893214   
1  8842281e1d1347389f2ab93d60773d4d  33282947   
2  8842281e1d1347389f2ab93d60773d4d  11387515   
3  8842281e1d1347389f2ab93d60773d4d  24396144   
4  8842281e1d1347389f2ab93d60773d4d  20484662   

                          review_id  is_read  rating review_text_incomplete  \
0  5d0e4e8825c68740703f65a18813fc93    False       0                    NaN   
1  f171a68daa8092d8aea3dccc2e025a81    False       0                    NaN   
2  2fd3cd1acb30b099c135e358669639da    False       0                    NaN   
3  d210e41fcc7e6dcd6ae896844a38a024    False       0                    NaN   
4  a99f9fa4ec4fd94cc2419c78af2086a8    False       0                    NaN   

                       date_added                    date_updated read_at  \
0  Fri Feb 24 09:00:30 -0800 2017  Fri Feb 24 09:00:30 -0800 2017     NaN   
1  Fri Feb 10 10:47:53 -0800 2017  Fri Feb 10 10:48:21 -0800 2017     NaN   

In [3]:
print(interaction_df.dtypes)

user_id                   object
book_id                    int64
review_id                 object
is_read                     bool
rating                     int64
review_text_incomplete    object
date_added                object
date_updated              object
read_at                   object
started_at                object
dtype: object


In [5]:
# Preprocessing

interaction_df['rating'] = pd.to_numeric(interaction_df['rating'], errors='coerce')
required_columns = ['user_id', 'book_id', 'rating']
df_preprocessed = (interaction_df[required_columns]
                  .drop_duplicates()
                  .dropna()
                  .query("rating >= 3.5"))

# filter to top active users/books
top_users = df_preprocessed['user_id'].value_counts().head(20000).index
top_books = df_preprocessed['book_id'].value_counts().head(20000).index
filtered_df = df_preprocessed[
    df_preprocessed['user_id'].isin(top_users) & 
    df_preprocessed['book_id'].isin(top_books)
]

In [7]:
print("After preprocessing:")
print("Unique users:", df_preprocessed['user_id'].nunique())
print("Unique books:", df_preprocessed['book_id'].nunique())
print("Total ratings:", len(df_preprocessed))

After preprocessing:
Unique users: 430848
Unique books: 112365
Total ratings: 4573179


In [9]:
from scipy.sparse import csr_matrix

# Create mappings
user_to_idx = {user: idx for idx, user in enumerate(filtered_df['user_id'].unique())}
book_to_idx = {book: idx for idx, book in enumerate(filtered_df['book_id'].unique())}

# Create sparse matrix
rows = filtered_df['user_id'].map(user_to_idx)
cols = filtered_df['book_id'].map(book_to_idx)
values = filtered_df['rating']

sparse_matrix = csr_matrix((values, (rows, cols)), 
                        shape=(len(user_to_idx), len(book_to_idx)))

In [11]:
#pip install scikit-surprise

In [13]:
# Process pivot table in batches for Users and Items
batch_size = 5000
dense_chunks = []

for i in range(0, 20000, batch_size):
    # Get batch of users
    batch_users = list(user_to_idx.keys())[i:i+batch_size]
    batch_df = filtered_df[filtered_df['user_id'].isin(batch_users)]
    
    # Create dense batch matrix
    batch_matrix = batch_df.pivot_table(
        index='user_id',
        columns='book_id',
        values='rating',
        fill_value=0
    ).reindex(index=batch_users, columns=book_to_idx.keys(), fill_value=0)
    
    dense_chunks.append(batch_matrix)

# Combine to final pivot table
user_item_matrix = pd.concat(dense_chunks)

In [15]:
user_item_matrix 

book_id,23310161,817720,502362,1969280,17290220,1027760,231850,460548,4948,76933,...,9675860,17927219,13139295,26213166,10266194,10531351,10772394,8456290,7574926,15942760
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8842281e1d1347389f2ab93d60773d4d,4.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7b2e5fe9fd353fecf3eeebb4850b88d3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4672eb229c808b792b8ea95f01f19784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
559d843b319087e12f48282e386e401f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cd6522e9018f2f77332ec74f928f8c45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
edb4e5298a821ecdcf49b49e99a9848a,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0b9260c7c314956d4dd9be954adbb932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
df61fb0b19aa2c1c277b1deb8e575824,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d509e8da95c233acac0753b8783d426a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Memory-based collaborative filtering
1. User-based filtering
2. Item-based filtering

mean: The average rating given to the book by the similar users.

count: The number of similar users who rated that book.

1. User-based filtering

In [23]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

def get_top_similar_users_knn(target_user_id, user_item_df, n=5):
    user_ids = user_item_df.index
    # Convert to sparse 
    user_item_matrix = csr_matrix(user_item_df.values)

    # Find similar users
    knn = NearestNeighbors(n_neighbors=n+1, metric='cosine', algorithm='brute')
    knn.fit(user_item_matrix)
    
    # target_idx = user_item_matrix.index.get_loc(target_user_id)
    target_idx = user_ids.get_loc(target_user_id)
    distances, indices = knn.kneighbors(user_item_matrix[target_idx])
    
    # Return similar users (excluding the target user themselves)
    similar_users = user_ids[indices.flatten()[1:]]
    similarities = 1 - distances.flatten()[1:]  # Convert distances to similarities
    return pd.Series(similarities, index=similar_users)

"""Get book recommendations from similar users"""
def memory_based_recommendations(target_user_id, user_item_matrix, filtered_df, n_users=5, n_books=5):
   
    # Get similar users
    similar_users = get_top_similar_users_knn(target_user_id, user_item_matrix, n_users)
    
    # Get books rated by similar users but not by target user
    target_books = set(filtered_df[filtered_df['user_id'] == target_user_id]['book_id'])
    similar_users_books = filtered_df[filtered_df['user_id'].isin(similar_users.index)]
    
    # Filter out books already rated by target user
    candidate_books = similar_users_books[~similar_users_books['book_id'].isin(target_books)]
    
    # Calculate weighted ratings based on user similarity
    candidate_books = candidate_books.copy()
    candidate_books['weighted_rating'] = candidate_books.apply(
        lambda x: x['rating'] * similar_users[x['user_id']], axis=1
    )
    
    # Get top rated books with weighted average
    book_scores = candidate_books.groupby('book_id').agg({
        'weighted_rating': 'sum',
        'user_id': 'count',
        'rating': 'mean'
    })
    
    # Normalize the weighted ratings by dividing by sum of similarities
    book_scores['similarity_sum'] = book_scores['user_id'].apply(
        lambda count: sum([similar_users[user] for user in 
                          candidate_books[candidate_books['book_id'] == book_scores.index[book_scores['user_id'] == count].values[0]]['user_id']])
    )
    
    book_scores['normalized_score'] = book_scores['weighted_rating'] / book_scores['similarity_sum']
    
    # Sort and get top books
    top_books = book_scores.sort_values('normalized_score', ascending=False).head(n_books)
    top_books = top_books.reset_index()
    
    # Merge with book titles
    top_books_with_titles = pd.merge(top_books, books_df[['book_id', 'title']], on='book_id', how='left')
    
    return top_books_with_titles[['book_id', 'title', 'normalized_score', 'rating', 'user_id']]

2. Item-based filtering

In [26]:
from collections import defaultdict
"""Compute pairwise item similarities using cosine distance."""
def get_item_similarities(item_item_matrix, n_neighbors=10, batch_size=1000):
    n_items = item_item_matrix.shape[0]
    
    all_similarities = []
    all_indices = []
    
    knn = NearestNeighbors(n_neighbors=n_neighbors+1, metric='cosine', algorithm='brute')
    knn.fit(item_item_matrix)

    for start in range(0, n_items, batch_size):
        end = min(start + batch_size, n_items)
        batch = item_item_matrix[start:end]

        distances, indices = knn.kneighbors(batch)
        similarities = 1 - distances

        all_similarities.append(similarities)
        all_indices.append(indices)
    
    # Combine
    full_similarities = np.vstack(all_similarities)
    full_indices = np.vstack(all_indices)

    return full_similarities, full_indices


"""Generate recommendations based on item-item similarities."""
def item_based_recommendations(target_user_id, user_item_matrix, filtered_df, books_df, n_books=5):
    # Get books rated by the target user
    target_books = filtered_df[filtered_df['user_id'] == target_user_id]['book_id'].values
    
    if len(target_books) == 0:
        return pd.DataFrame()  # Handle cold-start

    # Create item-item matrix (transpose of user-item matrix)
    item_item_matrix = user_item_matrix.T 
    
    # Compute item similarities (precompute once and cache for production)
    item_similarities, item_indices = get_item_similarities(item_item_matrix)
    
    # Map book_id to matrix index
    book_to_idx = {book_id: idx for idx, book_id in enumerate(user_item_matrix.columns)}
    
    # For each book rated by the user, find similar books
    candidate_scores = defaultdict(float)
    for book_id in target_books:
        if book_id not in book_to_idx:
            continue
        book_idx = book_to_idx[book_id]
        similar_indices = item_indices[book_idx][1:]  # Exclude self
        similar_scores = item_similarities[book_idx][1:]
        
        # Aggregate scores across all similar items
        for sim_idx, sim_score in zip(similar_indices, similar_scores):
            similar_book_id = user_item_matrix.columns[sim_idx]
            candidate_scores[similar_book_id] += sim_score
    
    # Filter out books already rated by the user
    candidate_scores = {
        book_id: score for book_id, score in candidate_scores.items()
        if book_id not in target_books
    }
    
    # Sort and get top books
    top_books = sorted(candidate_scores.items(), key=lambda x: x[1], reverse=True)[:n_books]
    top_books = pd.DataFrame(top_books, columns=['book_id', 'item_score'])
    
    # Merge with book titles
    top_books = pd.merge(top_books, books_df[['book_id', 'title']], on='book_id', how='left')
    return top_books

Train SVD

In [28]:
"""Train SVD model on user-item matrix"""
def train_svd_model(sparse_matrix, n_factors=50):
    
    # Compute row means efficiently on sparse matrix
    row_sums = sparse_matrix.sum(axis=1).A1  # Convert to 1D array
    row_counts = np.diff(sparse_matrix.indptr)  # Number of non-zero elements per row
    row_means = np.zeros_like(row_sums, dtype=np.float64)
    
    # Avoid division by zero
    non_zero_mask = row_counts > 0
    row_means[non_zero_mask] = row_sums[non_zero_mask] / row_counts[non_zero_mask]
    
    # Perform SVD directly on sparse matrix
    u, sigma, vt = svds(sparse_matrix, k=50)
    
    # Convert sigma to diagonal matrix
    sigma_diag = np.diag(sigma)
    
    return (u, sigma, vt, row_means)

Model-based collaborative filtering - SVD

In [31]:
"""Get book recommendations using SVD """
def model_based_recommendations(target_user_id, user_item_matrix, books_df, svd_data, n_books=5):
    # Find target user's index
    target_idx = user_item_matrix.index.get_loc(target_user_id)
    
    # Unpack SVD data
    u, sigma, vt, user_ratings_mean = svd_data

    # Build book_to_idx dictionary here
    book_to_idx = {book_id: idx for idx, book_id in enumerate(user_item_matrix.columns)}

    # Reconstruct the ratings for the target user only (not the whole matrix)
    user_u = u[target_idx].reshape(1, -1)
    user_predicted = user_ratings_mean[target_idx] + np.dot(np.dot(user_u, np.diag(sigma)), vt)
    user_predicted = user_predicted.flatten()
    
    # Get books that user hasn't rated
    user_rated_books = user_item_matrix.loc[target_user_id]
    user_rated_indices = user_rated_books[user_rated_books > 0].index
    rated_indices = [book_to_idx[book] for book in user_rated_indices if book in book_to_idx]
    
    # Create mask for unrated items (1 for unrated, 0 for rated)
    mask = np.ones(len(book_to_idx), dtype=bool)
    mask[rated_indices] = False
    
    # Apply mask to get only predictions for unrated items
    unrated_predictions = user_predicted[mask]
    unrated_indices = np.arange(len(book_to_idx))[mask]
    
    # Get top n books based on predictions
    top_indices = unrated_predictions.argsort()[-n_books:][::-1]
    top_book_indices = unrated_indices[top_indices]
    top_predictions = unrated_predictions[top_indices]
    
    # Convert indices back to book_ids
    idx_to_book = {idx: book_id for book_id, idx in book_to_idx.items()}
    top_book_ids = [idx_to_book[idx] for idx in top_book_indices]
    
    # Create dataframe with results
    results = pd.DataFrame({
        'book_id': top_book_ids,
        'predicted_rating': top_predictions
    })
    
    # Merge with book titles
    results = pd.merge(results, books_df[['book_id', 'title']], on='book_id', how='left')
    
    return results

Collaborative Filtering
- Combination

In [40]:
"""Get book recommendations using a hybrid approach"""
def collaborative_recommendations(target_user_id, user_item_matrix, filtered_df, books_df, 
                          svd_data, alpha=0.4, beta=0.3, gamma=0.3, n_users=5, n_books=10):
    """
    Combine user-based, item-based, and model-based recommendations.
    Weights:
    - alpha: user-based
    - beta: item-based
    - gamma: model-based (alpha + beta + gamma = 1)
    """
    # Get recommendations from all three methods
    user_recs = memory_based_recommendations(target_user_id, user_item_matrix, filtered_df, n_users, n_books*3)
    item_recs = item_based_recommendations(target_user_id, user_item_matrix, filtered_df, books_df, n_books*3)
    model_recs = model_based_recommendations(target_user_id, user_item_matrix, books_df, svd_data, n_books*3)
    
    # Normalize scores to [0, 1] for fair comparison
    def normalize_scores(df, score_col):
        min_score = df[score_col].min()
        max_score = df[score_col].max()
        df[score_col] = (df[score_col] - min_score) / (max_score - min_score + 1e-10)
        return df
    
    user_recs = normalize_scores(user_recs, 'normalized_score')
    item_recs = normalize_scores(item_recs, 'item_score')
    model_recs = normalize_scores(model_recs, 'predicted_rating')
    
    # Create dictionaries for easy lookup
    user_scores = dict(zip(user_recs['book_id'], user_recs['normalized_score']))
    item_scores = dict(zip(item_recs['book_id'], item_recs['item_score']))
    model_scores = dict(zip(model_recs['book_id'], model_recs['predicted_rating']))
    
    # Combine scores from all methods
    all_books = set(user_scores.keys()) | set(item_scores.keys()) | set(model_scores.keys())
    hybrid_scores = {}
    
    for book_id in all_books:
        hybrid_scores[book_id] = (
            alpha * user_scores.get(book_id, 0) +
            beta * item_scores.get(book_id, 0) +
            gamma * model_scores.get(book_id, 0)
        )
    
    # Sort and return top books
    top_books = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)[:n_books]
    results = pd.DataFrame(top_books, columns=['book_id', 'hybrid_score'])
    
    # Add component scores for debugging/analysis
    results['user_score'] = results['book_id'].map(user_scores).fillna(0)
    results['item_score'] = results['book_id'].map(item_scores).fillna(0)
    results['model_score'] = results['book_id'].map(model_scores).fillna(0)
    
    # Merge with book titles
    results = pd.merge(results, books_df[['book_id', 'title']], on='book_id', how='left')
    results.rename(columns={'user_score': 'memory_score'}, inplace=True)
    return results

Testing

In [42]:
books_df = pd.read_csv("C:/Users/minge/Downloads/csv file/books_data_file.csv")
print("\n=== Book Recommendation System ===")
print("This system uses **User-based collaborative filtering** to recommend books.")

# Train SVD model (always used in hybrid)
print("Training SVD model...")
sparse_mat = csr_matrix(user_item_matrix.values)
try:
    svd_data = train_svd_model(sparse_mat, n_factors=20)
    print("SVD model trained successfully!")
except MemoryError:
    print("Error: Not enough memory for SVD model. Exiting.")
    exit()

while True:
    print("\nAvailable user IDs:", list(user_item_matrix.index)[:5], "... (and more)")
    target_user = input("Enter user ID to get book recommendations (Q to quit): ").strip()
    
    if target_user.lower() == 'q':
        break

    if target_user not in user_item_matrix.index:
        print(f"Error: User '{target_user}' not found")
        continue

    try:
        n_users = int(input("How many similar users to consider? (Default 5): ") or 5)
        n_books = int(input("How many book recommendations? (Default 5): ") or 5)
        
    except ValueError:
        print("Invalid input. Using default values.")
        n_users, n_books, alpha = 5, 5, 0.5

    recommendations = collaborative_recommendations(
        target_user, user_item_matrix, filtered_df, books_df, 
        svd_data, n_users=n_users, n_books=n_books
    )
    print(f"\nTop {n_books} book recommendations for user {target_user} (collaborative):")
    print(recommendations[['title', 'hybrid_score', 'memory_score', 'model_score']])

    if input("\nFind more recommendations? (y/n): ").lower() != 'y':
        break

print("\nProgram ended.")


=== Book Recommendation System ===
This system uses **User-based collaborative filtering** to recommend books.
Training SVD model...
SVD model trained successfully!

Available user IDs: ['8842281e1d1347389f2ab93d60773d4d', '7b2e5fe9fd353fecf3eeebb4850b88d3', '4672eb229c808b792b8ea95f01f19784', '559d843b319087e12f48282e386e401f', 'cd6522e9018f2f77332ec74f928f8c45'] ... (and more)


Enter user ID to get book recommendations (Q to quit):  8842281e1d1347389f2ab93d60773d4d
How many similar users to consider? (Default 5):  5
How many book recommendations? (Default 5):  5



Top 5 book recommendations for user 8842281e1d1347389f2ab93d60773d4d (collaborative):
                title  hybrid_score  memory_score  model_score
0   A Little Princess      0.453105      1.000000     0.177018
1      Goodnight Moon      0.399786      0.000000     1.000000
2     Charlotte's Web      0.397176      0.000000     0.323920
3  The Cat in the Hat      0.328177      0.000000     0.644413
4        Black Beauty      0.234429      0.384355     0.268958



Find more recommendations? (y/n):  n



Program ended.
