### Data Preparation

In [55]:
import pandas as pd

# Display settings: never wrap a wide frame onto several lines, and never
# hide columns behind "...".
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', None)

# Load the raw book catalogue and the raw user ratings.
books = pd.read_csv('./dataset/Books.csv', low_memory=False)
rating = pd.read_csv('./dataset/Ratings.csv', low_memory=False)

# The three cover-image URL columns are not needed for the analysis.
books = books.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'])

# Attach book details to every rating; ratings whose ISBN has no match in
# `books` are discarded by the inner join.
df = pd.merge(rating, books, how='inner', on='ISBN')

# Report the number of missing values per column before removing them.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Keep only rows that are complete in every column.
df = df.dropna()

print(df.head(10))


User-ID                0
ISBN                   0
Book-Rating            0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
dtype: int64
   User-ID        ISBN  Book-Rating            Book-Title Book-Author Year-Of-Publication         Publisher
0   276725  034545104X            0  Flesh Tones: A Novel  M. J. Rose                2002  Ballantine Books
1     2313  034545104X            5  Flesh Tones: A Novel  M. J. Rose                2002  Ballantine Books
2     6543  034545104X            0  Flesh Tones: A Novel  M. J. Rose                2002  Ballantine Books
3     8680  034545104X            5  Flesh Tones: A Novel  M. J. Rose                2002  Ballantine Books
4    10314  034545104X            9  Flesh Tones: A Novel  M. J. Rose                2002  Ballantine Books
5    23768  034545104X            0  Flesh Tones: A Novel  M. J. Rose                2002  Ballantine Books
6    28266  034545104X            0  Flesh Tones: A Nove

### Creating User Book Data Frame

In [28]:
# Minimum number of ratings (rows of `df`) a title needs to stay in the matrix.
MIN_RATINGS_PER_BOOK = 250

# Number of ratings per book title, most-rated first.
title_counts = df["Book-Title"].value_counts()

# Same counts as a one-column frame (name kept from the original cell).
comment_counts = pd.DataFrame(title_counts)

# Titles rated fewer than MIN_RATINGS_PER_BOOK times.
# The comparison is done on the Series itself: the column label produced by
# value_counts() differs between pandas versions ("Book-Title" before 2.0,
# "count" from 2.0 on), so looking the column up by name is fragile.
rare_books = title_counts[title_counts < MIN_RATINGS_PER_BOOK].index

# Ratings that belong to the remaining, frequently rated titles.
common_books = df[~df["Book-Title"].isin(rare_books)]

# User x title matrix. Each cell is the mean of that user's ratings for the
# title (pivot_table's default aggregation); NaN where the user has no rating.
# Passing `values` as a plain string yields flat, title-labelled columns, so
# no extra "Book-Rating" column level has to be stripped afterwards.
user_book_df = common_books.pivot_table(
    index="User-ID",
    columns="Book-Title",
    values="Book-Rating",
)

# quick look at the top-left corner of the matrix
user_book_df.iloc[0:20, 0:20]

Book-Title,1984,1st to Die: A Novel,2nd Chance,A Bend in the Road,"A Child Called \It\"": One Child's Courage to Survive""",A Heartbreaking Work of Staggering Genius,A Is for Alibi (Kinsey Millhone Mysteries (Paperback)),A Map of the World,A Painted House,A Prayer for Owen Meany,A Time to Kill,A Walk to Remember,A Widow for One Year,ANGELA'S ASHES,About a Boy,Airframe,All I Really Need to Know,Along Came a Spider (Alex Cross Novels),American Gods,Angela's Ashes (MMP) : A Memoir
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
8,,,,,,,,,,,,,,,,,,,,
9,,,,,,,,,,,,,,,,,,,,
14,,,,,,,,,,,,,,,,,,,,
16,,,,,,,,,,,,,,,,9.0,,,,
26,,,,,,,,,,,,,,,,,,,,
44,,,,,,,,,,,,,,,,,,,,
51,,,,,,,,,,,,,,,,,,,,
67,,,,,,,,,,,,,,,,,,,,
91,,,,,,,,,,,,,,,,,,,,
95,,,,,,,,,,,,,,,,,,,,


In [29]:
# Write the user x title rating matrix to CSV. With index=False the row
# labels (the pivot table's User-ID index) are not written; only the
# per-title columns end up in the file.
# NOTE(review): confirm that consumers of train_data.csv do not need User-ID.
user_book_df.to_csv(path_or_buf='train_data.csv', index=False)

In [31]:
# Number of ratings submitted by each user, most active users first.
# rename_axis() + reset_index(name=...) label both columns explicitly, so the
# result is ['User-ID', 'Count'] regardless of how the installed pandas
# version names the output of value_counts(). The previous rename mapping
# ('index' -> 'User-ID', 'User-ID' -> 'Count') only matched the pre-2.0
# labels and mislabelled the user-id column as 'Count' on pandas >= 2.0.
rating_count = (
    rating['User-ID']
    .value_counts()
    .rename_axis('User-ID')
    .reset_index(name='Count')
)
print(rating_count)

        User-ID  Count
0         11676  13602
1        198711   7550
2        153662   6109
3         98391   5891
4         35859   5850
...         ...    ...
105278    69281      1
105279    69239      1
105280    69241      1
105281    69245      1
105282   276733      1

[105283 rows x 2 columns]


### Making Item-Based Book Recommendation

In [5]:
# Pick one book title at random from the columns of the user x title matrix.
# The draw is unseeded, so a different title is chosen on every run.
book_name = pd.Series(user_book_df.columns).sample(1).values[0]

# printing book name for observation
print("Book name: "+book_name)

# Ratings column of the chosen book (one value per user, NaN if not rated).
book_name_df = user_book_df[book_name]

# Correlate every title's rating column with the chosen book's column and
# keep the 9 most correlated titles.
# The chosen book is removed by label rather than by skipping the first row:
# another title can tie with it at a correlation of 1.0, in which case the
# book itself is not guaranteed to sort first.
# The result is deliberately not called `recommend`, because a function of
# that name is defined further down in the notebook.
similar_books = (
    user_book_df.corrwith(book_name_df)
    .drop(labels=book_name, errors="ignore")
    .sort_values(ascending=False)
    .head(9)
)

# Recommending
print(similar_books)

Book name: Pop Goes the Weasel
Book-Title
The Queen of the Damned (Vampire Chronicles (Paperback))    0.706323
Isle of Dogs                                                0.628436
The Prince of Tides                                         0.617679
Deception Point                                             0.615102
The Beach House                                             0.601784
Unnatural Exposure                                          0.597217
The Lost World                                              0.588705
The Witching Hour (Lives of the Mayfair Witches)            0.554751
A Map of the World                                          0.546604
dtype: float64


### Collaborative Book Recommendation

In [37]:
# Popularity-based recommender: per-title rating count and mean rating.

# Every rating joined with the details of the book it refers to.
ratings_with_name = rating.merge(books, on='ISBN')

# Both statistics are computed from the same per-title grouping of ratings.
ratings_by_title = ratings_with_name.groupby('Book-Title')['Book-Rating']

# How many ratings each title received.
num_rating_df = ratings_by_title.count().reset_index(name='num_ratings')

# Mean rating of each title.
avg_rating_df = ratings_by_title.mean().reset_index(name='avg_rating')

# One row per title carrying both statistics.
popular_df = num_rating_df.merge(avg_rating_df, on='Book-Title')
print(popular_df)

                                               Book-Title  num_ratings  avg_rating
0        A Light in the Storm: The Civil War Diary of ...            4    2.250000
1                                   Always Have Popsicles            1    0.000000
2                    Apple Magic (The Collector's series)            1    0.000000
3        Ask Lily (Young Women of Faith: Lily Series, ...            1    8.000000
4        Beyond IBM: Leadership Marketing and Finance ...            1    0.000000
...                                                   ...          ...         ...
241066                                      Ã?Â?lpiraten.            2    0.000000
241067                     Ã?Â?rger mit Produkt X. Roman.            4    5.250000
241068                                Ã?Â?sterlich leben.            1    7.000000
241069                              Ã?Â?stlich der Berge.            3    2.666667
241070                                  Ã?Â?thique en toc            2    4.000000

[24

In [47]:
# Keep titles with at least 250 ratings, take the 50 with the highest mean
# rating, and reduce the result to one row per title.
# The join with `books` creates one row per catalogue entry of a title;
# drop_duplicates() collapses these again, and only the three columns that
# popular_df already had are kept.
popular_df = (
    popular_df.loc[popular_df['num_ratings'] >= 250]
    .sort_values(by='avg_rating', ascending=False)
    .iloc[:50]
    .merge(books, on='Book-Title')
    .drop_duplicates(subset='Book-Title')
    .loc[:, ['Book-Title', 'num_ratings', 'avg_rating']]
)

# Allow up to 100 rows and very long cell contents when frames are shown.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 1000)

In [50]:
# Users who submitted more than 200 ratings.
ratings_per_user = ratings_with_name.groupby('User-ID')['Book-Rating'].count()
wellread_users = ratings_per_user[ratings_per_user > 200].index

# Ratings made by those users only.
filtered_rating = ratings_with_name[ratings_with_name['User-ID'].isin(wellread_users)]

# Titles that received at least 40 ratings from those users.
ratings_per_title = filtered_rating.groupby('Book-Title')['Book-Rating'].count()
famous_books = ratings_per_title[ratings_per_title >= 40].index

# Ratings restricted to both the selected users and the selected titles.
final_ratings = filtered_rating[filtered_rating['Book-Title'].isin(famous_books)]

# Title x user matrix of (mean) ratings; missing user/title pairs become 0.
pt = final_ratings.pivot_table(
    index='Book-Title',
    columns='User-ID',
    values='Book-Rating',
).fillna(0)

In [51]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of `pt`: entry [i][j] compares
# row i with row j, in the same order as pt.index.
similarity_scores = cosine_similarity(pt)

def recommend(book_name, top_n=5):
    """Return the titles most similar to ``book_name`` together with their authors.

    Reads three module-level objects: ``pt`` (its index holds the book
    titles), ``similarity_scores`` (row ``i`` holds the similarity of
    ``pt.index[i]`` to every title, in ``pt.index`` order) and ``books``
    (provides the ``Book-Title`` and ``Book-Author`` columns).

    Parameters
    ----------
    book_name : str
        Title to find neighbours for; must be one of the labels in ``pt.index``.
    top_n : int, default 5
        Maximum number of similar titles to return.

    Returns
    -------
    list of list
        One ``[title, author]`` entry per similar book, most similar first.
        The queried book itself is never part of the result.

    Raises
    ------
    IndexError
        If ``book_name`` is not a label of ``pt.index``.
    """
    # Row position of the requested title. Index.get_loc is used instead of
    # np.where so the function does not rely on numpy having been imported
    # somewhere else in the notebook.
    try:
        index = pt.index.get_loc(book_name)
    except KeyError as err:
        raise IndexError(
            f"{book_name!r} is not among the titles in the similarity matrix"
        ) from err

    # Rank all other titles by similarity. The requested book is excluded by
    # position instead of by dropping the first sorted entry: when another
    # title ties with it on the top score, the book itself is not
    # necessarily ranked first.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_scores[index])
        if position != index
    ]
    similar_items = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

    data = []
    for position, _score in similar_items:
        # First catalogue entry for this title (a title may be listed several
        # times in `books`).
        first_entry = books[books['Book-Title'] == pt.index[position]].drop_duplicates('Book-Title')

        item = []
        item.extend(list(first_entry['Book-Title'].values))
        item.extend(list(first_entry['Book-Author'].values))
        data.append(item)

    return data

# NOTE(review): book_name is not assigned in this cell; it is whatever an
# earlier cell left in the kernel (the item-based section samples it from
# user_book_df.columns). That title may not be a row of `pt`, which is built
# with different filters, so this call can fail depending on the sampled book.
recommend(book_name)

[['Vittorio the Vampire: New Tales of the Vampires', 'Anne Rice'],
 ['The Clinic (Alex Delaware Novels (Paperback))', 'Jonathan Kellerman'],
 ['The Midnight Club', 'James Patterson'],
 ['The Ritual Bath (Peter Decker &amp; Rina Lazarus Novels (Paperback))',
  'Faye Kellerman'],
 ['1st to Die: A Novel', 'James Patterson']]