<a href="https://colab.research.google.com/github/iAmHira19/Book_recommendation_engine/blob/main/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd

# Function to load data with more flexible parsing options
def load_data(filepath, delimiter=';', encoding='latin-1'):
    try:
        return pd.read_csv(filepath, delimiter=delimiter, encoding=encoding,
                           quoting=3,  # Disable quoting to avoid issues with unclosed quotes
                           escapechar='\\',  # Handle escape characters
                           on_bad_lines='warn',  # Warn about bad lines instead of skipping
                           engine='python')  # Use the Python engine for more flexible parsing
    except Exception as e:
        print(f"Error loading CSV file {filepath}: {e}")
        return pd.DataFrame()

# Load datasets
ratings_df = load_data('/content/BX-Book-Ratings.csv')
books_df = load_data('/content/BX-Books.csv')

# Print column names and first few rows to inspect the data
print("Ratings DataFrame columns:", ratings_df.columns)
print("Ratings DataFrame head:")
print(ratings_df.head())

print("Books DataFrame columns:", books_df.columns)
print("Books DataFrame head:")
print(books_df.head())

# Check for expected columns
expected_columns_ratings = ['User-ID', 'ISBN', 'Book-Rating']
expected_columns_books = ['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']

# Identify missing columns
missing_columns_ratings = [col for col in expected_columns_ratings if col not in ratings_df.columns]
missing_columns_books = [col for col in expected_columns_books if col not in books_df.columns]

if missing_columns_ratings:
    print(f"Missing columns in ratings DataFrame: {', '.join(missing_columns_ratings)}")
if missing_columns_books:
    print(f"Missing columns in books DataFrame: {', '.join(missing_columns_books)}")

# If columns are present, proceed with further steps
if not missing_columns_ratings and not missing_columns_books:
    # Data cleaning
    # Remove users with less than 200 ratings
    user_rating_counts = ratings_df['User-ID'].value_counts()
    valid_users = user_rating_counts[user_rating_counts >= 200].index
    ratings_df = ratings_df[ratings_df['User-ID'].isin(valid_users)]

    # Remove books with less than 100 ratings
    book_rating_counts = ratings_df['ISBN'].value_counts()
    valid_books = book_rating_counts[book_rating_counts >= 100].index
    ratings_df = ratings_df[ratings_df['ISBN'].isin(valid_books)]

    # Merge ratings with book details
    merged_df = pd.merge(ratings_df, books_df, on='ISBN')

    # Vectorize book titles for similarity comparison
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(merged_df['Book-Title'])

    # Compute pairwise cosine similarity matrix
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Create a function to get recommendations
    def get_recommends(book_title):
        # Find the index of the book title
        if book_title not in merged_df['Book-Title'].values:
            return [book_title, []]  # Return empty list if the book is not in the dataset

        book_idx = merged_df[merged_df['Book-Title'] == book_title].index[0]

        # Get the similarity scores for the book
        sim_scores = list(enumerate(cosine_sim[book_idx]))

        # Sort the books based on similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the top 5 similar books
        sim_scores = sim_scores[1:6]

        # Get the book indices and scores
        book_indices = [i[0] for i in sim_scores]
        book_scores = [i[1] for i in sim_scores]

        # Get the book titles
        recommended_books = merged_df.iloc[book_indices]['Book-Title'].tolist()

        # Return the list of recommended books with their distances
        return [book_title, list(zip(recommended_books, book_scores))]

    # Test the function
    print(get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))"))
else:
    print("Unable to proceed due to missing columns.")


Skipping line 161765: Expected 3 fields in line 161765, saw 4
Skipping line 201981: Expected 3 fields in line 201981, saw 4
Skipping line 6: Expected 8 fields in line 6, saw 9
Skipping line 23: Expected 8 fields in line 23, saw 9
Skipping line 25: Expected 8 fields in line 25, saw 10
Skipping line 39: Expected 8 fields in line 39, saw 9
Skipping line 85: Expected 8 fields in line 85, saw 9
Skipping line 86: Expected 8 fields in line 86, saw 9
Skipping line 120: Expected 8 fields in line 120, saw 9
Skipping line 142: Expected 8 fields in line 142, saw 9
Skipping line 193: Expected 8 fields in line 193, saw 9
Skipping line 195: Expected 8 fields in line 195, saw 9
Skipping line 205: Expected 8 fields in line 205, saw 9
Skipping line 241: Expected 8 fields in line 241, saw 9
Skipping line 293: Expected 8 fields in line 293, saw 9
Skipping line 294: Expected 8 fields in line 294, saw 9
Skipping line 304: Expected 8 fields in line 304, saw 9
Skipping line 316: Expected 8 fields in line 316,

Ratings DataFrame columns: Index(['"User-ID"', '"ISBN"', '"Book-Rating"'], dtype='object')
Ratings DataFrame head:
  "User-ID"        "ISBN" "Book-Rating"
0  "276725"  "034545104X"           "0"
1  "276726"  "0155061224"           "5"
2  "276727"  "0446520802"           "0"
3  "276729"  "052165615X"           "3"
4  "276729"  "0521795028"           "6"
Books DataFrame columns: Index(['"ISBN"', '"Book-Title"', '"Book-Author"', '"Year-Of-Publication"',
       '"Publisher"', '"Image-URL-S"', '"Image-URL-M"', '"Image-URL-L"'],
      dtype='object')
Books DataFrame head:
         "ISBN"                                       "Book-Title"  \
0  "0195153448"                              "Classical Mythology"   
1  "0002005018"                                     "Clara Callan"   
2  "0060973129"                             "Decision in Normandy"   
3  "0374157065"  "Flu: The Story of the Great Influenza Pandemi...   
4  "0399135782"                           "The Kitchen God's Wife"   

      