In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle


ModuleNotFoundError: No module named 'numpy'

# 1. Load and Preprocess Data
# In this section, we load the raw datasets, perform initial cleaning by renaming columns for consistency, and merge them into a single comprehensive DataFrame.

Data Loading and Initial Merging

In [5]:
print("Loading datasets...")
# --- FIX: Added encoding='latin-1' to handle special characters in the data ---
books = pd.read_csv('Books.csv', low_memory=False, encoding='latin-1', on_bad_lines='skip')
ratings = pd.read_csv('Ratings.csv', low_memory=False, encoding='latin-1', on_bad_lines='skip')

print("Cleaning and preprocessing data...")
books.rename(columns={'Book-Title':'title', 'Book-Author':'author', 'Year-Of-Publication':'year', 'Publisher':'publisher', 'Image-URL-L':'image_url'}, inplace=True)
ratings.rename(columns={'User-ID':'user_id', 'Book-Rating':'rating'}, inplace=True)

# Merge ratings and books on the 'ISBN' column
ratings_with_name = ratings.merge(books, on='ISBN')

print("Data loaded and preprocessed.")
print("Ratings with book names shape:", ratings_with_name.shape)

Loading datasets...
Cleaning and preprocessing data...
Data loaded and preprocessed.
Ratings with book names shape: (254997, 10)


# 2. Build Popularity-Based Recommender Data
# This model recommends books based on simple popularity. We identify the most popular books by considering only those with a significant number of ratings (>= 250) and then sorting them by their average rating.

 Calculate Popularity Metrics and Create popular_df

In [6]:
print("\nBuilding popularity model data...")
# Calculate the number of ratings for each book
num_rating_df = ratings_with_name.groupby('title').count()['rating'].reset_index()
num_rating_df.rename(columns={'rating':'num_ratings'}, inplace=True)

# Calculate the average rating for each book
avg_rating_df = ratings_with_name.groupby('title').mean(numeric_only=True)['rating'].reset_index()
avg_rating_df.rename(columns={'rating':'avg_rating'}, inplace=True)

# Merge to create a dataframe with both metrics
popular_df = num_rating_df.merge(avg_rating_df, on='title')

# Filter for books with at least 250 ratings
popular_df = popular_df[popular_df['num_ratings'] >= 250].sort_values('avg_rating', ascending=False)

# Merge with the main books dataframe to get details (author, image_url)
popular_df = popular_df.merge(books, on='title').drop_duplicates('title')[['title', 'author', 'image_url', 'num_ratings', 'avg_rating']]

print("Popularity DataFrame created. Shape:", popular_df.shape)


Building popularity model data...
Popularity DataFrame created. Shape: (30, 5)


# 3. Build Collaborative Filtering Model
# This model is based on the "wisdom of the crowd." It recommends books by
# finding patterns in user ratings. We filter the data to include only
# experienced users and frequently-rated books to reduce noise.

Filter Data and Build User-Item Matrix

In [7]:
print("\nBuilding collaborative filtering model...")
# Filter for users who have rated more than 200 books
x = ratings_with_name.groupby('user_id').count()['rating'] > 200
exp_users = x[x].index
filtered_rating = ratings_with_name[ratings_with_name['user_id'].isin(exp_users)]

# Filter for books with at least 50 ratings from this experienced group
y = filtered_rating.groupby('title').count()['rating'] >= 50
famous_books = y[y].index
final_ratings = filtered_rating[filtered_rating['title'].isin(famous_books)]

# Create the user-item pivot table
pt = final_ratings.pivot_table(index='title', columns='user_id', values='rating')
pt.fillna(0, inplace=True)

print("Pivot table for collaborative filtering created. Shape:", pt.shape)


Building collaborative filtering model...
Pivot table for collaborative filtering created. Shape: (56, 177)


Calculate Collaborative Similarity

We use Cosine Similarity to measure how similarly books were rated by users.

In [8]:
collaborative_similarity_scores = cosine_similarity(pt)
print("Collaborative similarity matrix created. Shape:", collaborative_similarity_scores.shape)


Collaborative similarity matrix created. Shape: (56, 56)


 # 4. Build Content-Based Filtering Model
# This model recommends books based on their content attributes (author and
# publisher). It uses TF-IDF to convert text features into numerical vectors,
# which can then be compared for similarity.

 Feature Engineering and Content Similarity

In [9]:
print("\nBuilding content-based filtering model...")
# Use books that are present in the collaborative model for consistency
content_df = books[books['title'].isin(pt.index)].copy()
content_df.drop_duplicates(subset='title', inplace=True)
content_df.reset_index(drop=True, inplace=True)

# Create 'tags' from author and publisher
content_df['tags'] = content_df['author'] + " " + content_df['publisher']
content_df['tags'] = content_df['tags'].fillna('').astype(str)

# Vectorize tags using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(content_df['tags'])

# Calculate similarity
content_similarity_scores = cosine_similarity(tfidf_matrix)
print("Content-based similarity matrix created. Shape:", content_similarity_scores.shape)


Building content-based filtering model...
Content-based similarity matrix created. Shape: (56, 56)


# 5. Offline Model Evaluation

# To validate our collaborative model, we perform an offline evaluation. We split the data into a training set (past) and a testing set (future) and measure how well the model can predict books a user will like.

Run Evaluation

In [10]:
print("\n--- Starting Offline Model Evaluation ---")
train_data, test_data = train_test_split(final_ratings, test_size=0.2, random_state=42)
train_pt = train_data.pivot_table(index='title', columns='user_id', values='rating')
train_pt.fillna(0, inplace=True)
train_similarity_scores = cosine_similarity(train_pt)

def calculate_precision_recall(test_data, train_pt, similarity_scores, k=5):
    title_to_index = {title: i for i, title in enumerate(train_pt.index)}
    test_users = test_data['user_id'].unique()
    total_precision, total_recall, processed_users = 0, 0, 0

    for user_id in test_users:
        true_positives = set(test_data[test_data['user_id'] == user_id]['title'])
        train_positives = set(train_data[train_data['user_id'] == user_id]['title'])
        if not train_positives or not true_positives: continue

        last_liked_book = list(train_positives)[-1]
        if last_liked_book not in title_to_index: continue

        index = title_to_index[last_liked_book]
        similar_items = sorted(list(enumerate(similarity_scores[index])), key=lambda x: x[1], reverse=True)[1:k+1]
        recommended_titles = {train_pt.index[i[0]] for i in similar_items}

        hits = len(recommended_titles.intersection(true_positives))
        if hits > 0:
            precision = hits / k
            recall = hits / len(true_positives)
            total_precision += precision
            total_recall += recall
            processed_users += 1

    avg_precision = total_precision / processed_users if processed_users > 0 else 0
    avg_recall = total_recall / processed_users if processed_users > 0 else 0
    return avg_precision, avg_recall

precision_at_5, recall_at_5 = calculate_precision_recall(test_data, train_pt, train_similarity_scores, k=5)
print(f"Evaluation Results (k=5):")
print(f"Average Precision@5: {precision_at_5:.4f}")
print(f"Average Recall@5: {recall_at_5:.4f}")
print("---------------------------------------")


--- Starting Offline Model Evaluation ---
Evaluation Results (k=5):
Average Precision@5: 0.2444
Average Recall@5: 0.3337
---------------------------------------


# 6. Export Final Artifacts

# Finally, we save all the necessary Python objects (DataFrames and similarity matrices) into a single .pkl file. The Flask application will load this file to make live recommendations without re-running the analysis.

Save the .pkl file

In [11]:
print("\nExporting all production data to recommender_data.pkl...")
recommender_data = {
    'popular_df': popular_df,
    'pt': pt,
    'books': books,
    'collaborative_similarity_scores': collaborative_similarity_scores,
    'content_df': content_df,
    'content_similarity_scores': content_similarity_scores
}
pickle.dump(recommender_data, open('recommender_data.pkl', 'wb'))

print("\n-------------------------------------------------")
print("All models built and data exported successfully!")
print("-------------------------------------------------")


Exporting all production data to recommender_data.pkl...

-------------------------------------------------
All models built and data exported successfully!
-------------------------------------------------
