In [1]:
import pandas as pd


books_data = pd.read_csv(
    r"C:\Users\justino\Desktop\books.csv",  # Use raw string for file path
    quotechar='"',  # Handle quoted fields properly
    on_bad_lines="warn"  # Skip problematic rows and warn about them
)


print("Dataset loaded successfully.")
print(books_data.info())

# Calculate the mean rating across all books (C)
C = books_data['average_rating'].mean()

# Calculate the 90th percentile of the number of ratings (m)
m = books_data['ratings_count'].quantile(0.90)

# Filter books that meet the minimum ratings requirement
popular_books = books_data[books_data['ratings_count'] >= m]

# Define a function to compute the weighted rating
def weighted_rating(x, m=m, C=C):
    v = x['ratings_count']
    R = x['average_rating']
    return (v / (v + m) * R) + (m / (v + m) * C)

# Calculate the weighted rating for the filtered books
popular_books.loc[:, 'score'] = popular_books.apply(weighted_rating, axis=1)

# Sort the books based on the score
popular_books = popular_books.sort_values(by='score', ascending=False)

# Display the top 10 popular books
popular_books_top_10 = popular_books[['title', 'authors', 'average_rating', 'ratings_count', 'score']].head(10)
print("Top 10 Popular Books:")
print(popular_books_top_10)

# Save the results to a file (optional)
popular_books_top_10.to_csv(r"C:\Users\justino\Desktop\top_10_popular_books.csv", index=False)


Dataset loaded successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11123 entries, 0 to 11122
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              11123 non-null  int64  
 1   title               11123 non-null  object 
 2   authors             11123 non-null  object 
 3   average_rating      11123 non-null  float64
 4   isbn                11123 non-null  object 
 5   isbn13              11123 non-null  int64  
 6   language_code       11123 non-null  object 
 7     num_pages         11123 non-null  int64  
 8   ratings_count       11123 non-null  int64  
 9   text_reviews_count  11123 non-null  int64  
 10  publication_date    11123 non-null  object 
 11  publisher           11123 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.0+ MB
None
Top 10 Popular Books:
                                                  title  \
0     Harry Potter and the H

Skipping line 4704: expected 12 fields, saw 13
Skipping line 5879: expected 12 fields, saw 13
Skipping line 8981: expected 12 fields, saw 13

  books_data = pd.read_csv(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  popular_books.loc[:, 'score'] = popular_books.apply(weighted_rating, axis=1)


In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load the dataset, skipping malformed rows
books_data = pd.read_csv(
    r"C:\Users\justino\Desktop\books.csv",  # Replace with your actual file path
    quotechar='"',  # Handle quoted fields properly
    on_bad_lines="skip"  # Skip problematic rows
)

# Step 2: Clean column names
books_data.columns = books_data.columns.str.strip()

# Step 3: Use TF-IDF Vectorizer to process the authors column

tfidf = TfidfVectorizer(stop_words="english")  # Ignore common English words
tfidf_matrix = tfidf.fit_transform(books_data["authors"])  # Convert authors to numerical format

# Step 4: Calculate similarity between all books based on authors

cosine_sim = cosine_similarity(tfidf_matrix)

# Step 5: Create a mapping of book titles to their index

indices = pd.Series(books_data.index, index=books_data["title"]).drop_duplicates()

# Step 6: Define a function to recommend books
def recommend_books(title):
    # Check if the book exists in the dataset
    if title not in indices:
        return f"The title '{title}' is not in the dataset."

    # Get the index of the book
    idx = indices[title]

    # Get similarity scores for this book with all others
    sim_scores = cosine_sim[idx]
    sim_scores = sim_scores.flatten()  # Ensure it's a 1D array

    # Pair the indices with their similarity scores
    sim_scores = list(enumerate(sim_scores))

    # Sort books by similarity score (highest to lowest)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the 10 most similar books (ignoring itself)
    top_similar_indices = [i[0] for i in sim_scores[1:11]]

    # Filter out-of-bounds indices
    valid_indices = [i for i in top_similar_indices if i < len(books_data)]

    # Log any invalid indices (for debugging)
    invalid_indices = [i for i in top_similar_indices if i >= len(books_data)]
    if invalid_indices:
        print(f"Warning: Invalid indices skipped - {invalid_indices}")

    # Return the details of the top 10 similar books
    return books_data.iloc[valid_indices][["title", "authors", "average_rating"]]

# Step 7: Test the recommender system
test_title = "Libra"
result = recommend_books(test_title)

# Display the recommendations
if isinstance(result, str):  # If an error message is returned
    print(result)
else:
    print("Recommended Books:")
    print(result)


Recommended Books:
                   title      authors  average_rating
166           Underworld  Don DeLillo            3.92
167                Libra  Don DeLillo            3.99
168            Americana  Don DeLillo            3.43
169          Running Dog  Don DeLillo            3.43
170           Cosmopolis  Don DeLillo            3.22
171   Great Jones Street  Don DeLillo            3.48
172            The Names  Don DeLillo            3.64
3214         White Noise  Don DeLillo            3.87
3215           Americana  Don DeLillo            3.43
3216  Love-Lies-Bleeding  Don DeLillo            3.41
