In [1]:
!pip install scikit-learn pandas numpy

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from google.colab import files




In [3]:
print("Please upload books.csv, users.csv, ratings.csv")
uploaded = files.upload()

books = pd.read_csv('/content/Books.csv')
users = pd.read_csv('/content/Users.csv')
ratings = pd.read_csv('/content/Ratings.csv')

print("Data loaded")
print("Books:", books.shape)
print("Ratings:", ratings.shape)
print("Users:", users.shape)

Please upload books.csv, users.csv, ratings.csv


Saving Users.csv to Users (1).csv
Saving DeepRec.png to DeepRec (1).png
Saving Ratings.csv to Ratings (1).csv
Saving Books.csv to Books (1).csv


  books = pd.read_csv('/content/Books.csv')


Data loaded
Books: (271360, 8)
Ratings: (1149780, 3)
Users: (278858, 3)


In [4]:
print("Missing values in Books:\n", books.isnull().sum())
print("Missing values in Users:\n", users.isnull().sum())
print("Missing values in Ratings:\n", ratings.isnull().sum())

print("Duplicates -> Books:", books.duplicated().sum(),
      "Ratings:", ratings.duplicated().sum(),
      "Users:", users.duplicated().sum())


Missing values in Books:
 ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64
Missing values in Users:
 User-ID          0
Location         0
Age         110762
dtype: int64
Missing values in Ratings:
 User-ID        0
ISBN           0
Book-Rating    0
dtype: int64
Duplicates -> Books: 0 Ratings: 0 Users: 0


In [6]:
# Handle missing values

# Books: fill missing text with "Unknown"
books['Book-Author'].fillna("Unknown", inplace=True)
books['Publisher'].fillna("Unknown", inplace=True)

# Books: fill missing images with placeholder
books['Image-URL-L'].fillna("N/A", inplace=True)

# Users: fill missing Age with 0 (or drop column if not needed)
users['Age'].fillna(0, inplace=True)

print("After cleaning:")
print("Books missing:\n", books.isnull().sum())
print("Users missing:\n", users.isnull().sum())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  books['Book-Author'].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  books['Publisher'].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

After cleaning:
Books missing:
 ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              0
Image-URL-S            0
Image-URL-M            0
Image-URL-L            0
dtype: int64
Users missing:
 User-ID     0
Location    0
Age         0
dtype: int64


In [8]:
ratings_with_name = ratings.merge(books, on='ISBN')

num_rating_df = ratings_with_name.groupby('Book-Title').count()['Book-Rating'].reset_index()
num_rating_df.rename(columns={'Book-Rating': 'num_ratings'}, inplace=True)

avg_rating_df = ratings_with_name.groupby('Book-Title').mean(numeric_only=True)['Book-Rating'].reset_index()
avg_rating_df.rename(columns={'Book-Rating': 'avg_rating'}, inplace=True)

popular_df = num_rating_df.merge(avg_rating_df, on='Book-Title')
popular_df = popular_df[popular_df['num_ratings'] >= 250].sort_values('avg_rating', ascending=False).head(50)
popular_df = popular_df.merge(books, on='Book-Title').drop_duplicates('Book-Title')[
    ['Book-Title', 'Book-Author', 'Image-URL-M', 'num_ratings', 'avg_rating']
]


In [9]:
x = ratings_with_name.groupby('User-ID').count()['Book-Rating'] > 200
active_users = x[x].index
filtered_rating = ratings_with_name[ratings_with_name['User-ID'].isin(active_users)]

y = filtered_rating.groupby('Book-Title').count()['Book-Rating'] >= 50
famous_books = y[y].index
final_ratings = filtered_rating[filtered_rating['Book-Title'].isin(famous_books)]


In [10]:
pt = final_ratings.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
pt.fillna(0, inplace=True)

similarity_scores = cosine_similarity(pt)


In [11]:
def recommend(book_name):
    if book_name not in pt.index:
        print("Book not found in dataset")
        return
    index = np.where(pt.index == book_name)[0][0]
    similar_items = sorted(list(enumerate(similarity_scores[index])),
                           key=lambda x: x[1], reverse=True)[1:6]
    print(f"Recommendations for '{book_name}':")
    for i in similar_items:
        print("-", pt.index[i[0]])

# Example
recommend("Harry Potter and the Chamber of Secrets (Book 2)")


Recommendations for 'Harry Potter and the Chamber of Secrets (Book 2)':
- Harry Potter and the Prisoner of Azkaban (Book 3)
- Harry Potter and the Goblet of Fire (Book 4)
- Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
- Harry Potter and the Sorcerer's Stone (Book 1)
- Harry Potter and the Order of the Phoenix (Book 5)


In [12]:
pickle.dump(popular_df, open('popular.pkl', 'wb'))
pickle.dump(pt, open('pt.pkl', 'wb'))
pickle.dump(similarity_scores, open('similarity_scores.pkl', 'wb'))
pickle.dump(books, open('books.pkl', 'wb'))
pickle.dump(final_ratings, open('final_ratings.pkl', 'wb'))

print("Models saved successfully")


Models saved successfully
