In [69]:
import re
import pickle
import operator
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.sparse import csr_matrix
from pandas.api.types import is_numeric_dtype
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

PREPROCESSING

In [70]:
# The three exports share one dialect: ';'-separated, ISO-8859-1 encoded,
# with malformed rows skipped and no warning emitted for them.
csv_options = dict(delimiter=';', error_bad_lines=False, encoding='ISO-8859-1', warn_bad_lines=False)

books = pd.read_csv(r"Books.csv", **csv_options)
users = pd.read_csv(r"Users.csv", **csv_options)
ratings = pd.read_csv(r"Book-Ratings.csv", **csv_options)

# Report the (rows, columns) shape of each freshly loaded table.
for label, frame in (("Books Data:    ", books), ("Users Data:    ", users), ("Books-ratings: ", ratings)):
    print(label, frame.shape)

Books Data:     (271360, 8)
Users Data:     (278858, 3)
Books-ratings:  (1149780, 3)


In [71]:
# Drop the small and large cover-image URL columns. Rebinding with
# errors='ignore' (instead of inplace=True) keeps the cell safe to re-run
# once the columns are already gone.
books = books.drop(columns=['Image-URL-S', 'Image-URL-L'], errors='ignore')

In [72]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-M            0
dtype: int64

In [73]:
books.loc[books['Book-Author'].isnull(),:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
187689,9627982032,The Credit Suisse Guide to Managing Your Personal Wealth,,1995,Edinburgh Financial Publishing,http://images.amazon.com/images/P/9627982032.01.MZZZZZZZ.jpg


In [74]:
books.loc[books['Publisher'].isnull(),:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
128890,193169656X,Tyrant Moon,Elaine Corvidae,2002,,http://images.amazon.com/images/P/193169656X.01.MZZZZZZZ.jpg
129037,1931696993,Finders Keepers,Linnea Sinclair,2001,,http://images.amazon.com/images/P/1931696993.01.MZZZZZZZ.jpg


In [75]:
# Replace missing author / publisher values with the placeholder 'Other'.
# Filling by column rather than by hard-coded row label keeps the cell
# correct if the row positions in the CSV ever change.
books['Book-Author'] = books['Book-Author'].fillna('Other')
books['Publisher'] = books['Publisher'].fillna('Other')

In [76]:
books['Year-Of-Publication'].unique()

array([2002, 2001, 1991, 1999, 2000, 1993, 1996, 1988, 2004, 1998, 1994,
       2003, 1997, 1983, 1979, 1995, 1982, 1985, 1992, 1986, 1978, 1980,
       1952, 1987, 1990, 1981, 1989, 1984, 0, 1968, 1961, 1958, 1974,
       1976, 1971, 1977, 1975, 1965, 1941, 1970, 1962, 1973, 1972, 1960,
       1966, 1920, 1956, 1959, 1953, 1951, 1942, 1963, 1964, 1969, 1954,
       1950, 1967, 2005, 1957, 1940, 1937, 1955, 1946, 1936, 1930, 2011,
       1925, 1948, 1943, 1947, 1945, 1923, 2020, 1939, 1926, 1938, 2030,
       1911, 1904, 1949, 1932, 1928, 1929, 1927, 1931, 1914, 2050, 1934,
       1910, 1933, 1902, 1924, 1921, 1900, 2038, 2026, 1944, 1917, 1901,
       2010, 1908, 1906, 1935, 1806, 2021, '2000', '1995', '1999', '2004',
       '2003', '1990', '1994', '1986', '1989', '2002', '1981', '1993',
       '1983', '1982', '1976', '1991', '1977', '1998', '1992', '1996',
       '0', '1997', '2001', '1974', '1968', '1987', '1984', '1988',
       '1963', '1956', '1970', '1985', '1978', '1973', '1980'

In [77]:
books.loc[books['Year-Of-Publication'] == 'DK Publishing Inc',:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
209538,078946697X,"DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)\"";Michael Teitelbaum""",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/078946697X.01.LZZZZZZZ.jpg
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)\"";James Buckley""",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0789466953.01.LZZZZZZZ.jpg


In [78]:
# Show full cell contents. None is the supported "no limit" value; the old
# -1 sentinel is deprecated in pandas 1.x and rejected by later releases.
pd.set_option('display.max_colwidth', None)

In [79]:
books.loc[books['Year-Of-Publication'] == 'Gallimard',:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-Marie Gustave Le ClÃ?Â©zio""",2003,Gallimard,http://images.amazon.com/images/P/2070426769.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/2070426769.01.LZZZZZZZ.jpg


In [80]:
# Row 220731 was parsed with title and author fused into one field, which
# pushed the publisher into the year column. Write the intended values back
# one cell at a time.
gallimard_fix = {
    'Publisher': 'Gallimard',
    'Year-Of-Publication': '2003',
    'Book-Title': 'Peuple du ciel - Suivi de Les bergers ',
    'Book-Author': 'Jean-Marie Gustave Le ClÃ?Â©zio',
}
for column_name, corrected_value in gallimard_fix.items():
    books.at[220731, column_name] = corrected_value

In [81]:
books.loc[books['Year-Of-Publication'] == 'DK Publishing Inc',:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
209538,078946697X,"DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)\"";Michael Teitelbaum""",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/078946697X.01.LZZZZZZZ.jpg
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)\"";James Buckley""",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0789466953.01.LZZZZZZZ.jpg


In [82]:
# Repair the two "DK Publishing Inc" rows whose title and author were fused
# into a single field, shifting the publisher into the year column.
# Each row label maps to its own (title, author). Previously the second
# title/author pair was written to row 209538 again, which overwrote the
# first book and left row 221678 with its malformed title.
dk_repairs = {
    209538: ('DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)',
             'Michael Teitelbaum'),
    221678: ('DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)',
             'James Buckley'),
}
for row_label, (title, author) in dk_repairs.items():
    books.at[row_label, 'Publisher'] = 'DK Publishing Inc'
    books.at[row_label, 'Year-Of-Publication'] = 2000
    books.at[row_label, 'Book-Title'] = title
    books.at[row_label, 'Book-Author'] = author


In [83]:
# Collapse the mixed int/str year values into one integer column, then list
# the distinct years in ascending order.
year_column = 'Year-Of-Publication'
books[year_column] = books[year_column].astype(int)
distinct_years = books[year_column].unique()
print(sorted(distinct_years))


[0, 1376, 1378, 1806, 1897, 1900, 1901, 1902, 1904, 1906, 1908, 1909, 1910, 1911, 1914, 1917, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2008, 2010, 2011, 2012, 2020, 2021, 2024, 2026, 2030, 2037, 2038, 2050]


In [84]:
# Tally how many books were published in each year.
count = Counter(books['Year-Of-Publication'])
# Keep the most frequent year(s) in a named variable. The previous list
# comprehension re-evaluated max() for every item and discarded its result.
peak_count = max(count.values(), default=0)
most_common_years = [year for year, n_books in count.items() if n_books == peak_count]
print(count)

Counter({2002: 17627, 1999: 17431, 2001: 17359, 2000: 17234, 1998: 15766, 1997: 14890, 2003: 14359, 1996: 14030, 1995: 13546, 1994: 11796, 1993: 10602, 1992: 9906, 1991: 9389, 1990: 8660, 1989: 7936, 1988: 7492, 1987: 6529, 1986: 5841, 2004: 5839, 1985: 5343, 1984: 4986, 0: 4618, 1983: 4499, 1982: 4197, 1981: 3279, 1980: 2676, 1979: 2208, 1978: 2131, 1977: 1897, 1976: 1597, 1975: 1219, 1974: 1023, 1973: 923, 1972: 772, 1971: 540, 1970: 459, 1969: 339, 1968: 233, 1966: 183, 1967: 176, 1965: 173, 1964: 149, 1960: 133, 1961: 132, 1963: 132, 1962: 123, 1959: 103, 1958: 77, 1957: 76, 1956: 74, 1955: 70, 1953: 63, 1954: 54, 2005: 46, 1951: 40, 1940: 36, 1952: 34, 1920: 33, 1950: 32, 1911: 19, 1942: 14, 1947: 14, 1946: 13, 1930: 13, 1923: 11, 1949: 11, 1941: 10, 1948: 9, 1939: 9, 1943: 8, 1945: 8, 1936: 7, 1938: 7, 2030: 7, 1929: 7, 1901: 7, 1937: 5, 1932: 5, 1933: 4, 1944: 4, 2020: 3, 1931: 3, 1900: 3, 1935: 3, 2006: 3, 2011: 2, 1925: 2, 1926: 2, 1928: 2, 1927: 2, 2050: 2, 1902: 2, 1924: 2, 

In [85]:
# Years later than 2021 and the 0 placeholder are both overwritten with 2002.
publication_year = books['Year-Of-Publication']
invalid_year = (publication_year > 2021) | (publication_year == 0)
books.loc[invalid_year, 'Year-Of-Publication'] = 2002

In [86]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              0
Image-URL-M            0
dtype: int64

In [87]:
books['ISBN'] = books['ISBN'].str.upper()

In [88]:
books.drop_duplicates(keep='last', inplace=True) 
books.reset_index(drop = True, inplace = True)

In [89]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271047 entries, 0 to 271046
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271047 non-null  object
 1   Book-Title           271047 non-null  object
 2   Book-Author          271047 non-null  object
 3   Year-Of-Publication  271047 non-null  int32 
 4   Publisher            271047 non-null  object
 5   Image-URL-M          271047 non-null  object
dtypes: int32(1), object(5)
memory usage: 11.4+ MB


In [90]:
books.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.01.MZZZZZZZ.jpg
3,374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.01.MZZZZZZZ.jpg
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.01.MZZZZZZZ.jpg


In [91]:
print(users.isna().sum())               

User-ID     0     
Location    0     
Age         110762
dtype: int64


In [92]:
print(sorted(list(users['Age'].unique())))

[nan, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 113.0, 114.0, 115.0, 116.0, 118.0, 119.0, 123.0, 124.0, 127.0, 128.0, 132.0, 133.0, 136.0, 137.0, 138.0, 140.0, 141.0, 143.0, 146.0, 147.0, 148.0, 151.0, 152.0, 156.0, 157.0, 159.0, 162.0, 168.0, 172.0, 175.0, 183.0, 186.0, 189.0, 199.0, 200.0, 201.0, 204.0, 207.0, 208.0, 209.0, 210.0, 212.0, 219.0, 220.0, 223.0, 226.0

In [93]:
# Keep only users whose age lies in the closed range [10, 80]; rows with a
# missing age fail the comparison and are excluded as well.
required = users[users['Age'].between(10, 80)]

In [94]:
mean = round(required['Age'].mean())

In [95]:
# Ages outside [10, 80] and missing ages are all replaced by the rounded
# mean of the in-range ages, then the column is stored as integers.
age_out_of_range = (users['Age'] > 80) | (users['Age'] < 10)
users.loc[age_out_of_range, 'Age'] = mean
users['Age'] = users['Age'].fillna(mean).astype(int)

In [96]:
users.drop(['Location'], axis=1, inplace=True)

In [97]:
users.drop_duplicates(keep='last', inplace=True)
users.reset_index(drop=True, inplace=True)

In [98]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   User-ID  278858 non-null  int64
 1   Age      278858 non-null  int32
dtypes: int32(1), int64(1)
memory usage: 3.2 MB


In [99]:
users.head(5)

Unnamed: 0,User-ID,Age
0,1,35
1,2,18
2,3,35
3,4,17
4,5,35


In [100]:
print("Columns: ", list(ratings.columns))
ratings.head()

Columns:  ['User-ID', 'ISBN', 'Book-Rating']


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [101]:
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [102]:
print(is_numeric_dtype(ratings['Book-Rating']))

True


In [103]:
print(is_numeric_dtype(ratings['User-ID']))

True


In [104]:
ratings['ISBN'] = ratings['ISBN'].str.upper()

In [105]:
ratings.drop_duplicates(keep='last', inplace=True)
ratings.reset_index(drop=True, inplace=True)

In [106]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [107]:
ratings.head(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [108]:
# Inner-join books -> ratings on ISBN, then -> users on User-ID, so only
# ratings with both a known book and a known user survive.
dataset = (
    books
    .merge(ratings, on='ISBN', how='inner')
    .merge(users, on='User-ID', how='inner')
)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031273 entries, 0 to 1031272
Data columns (total 9 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   ISBN                 1031273 non-null  object
 1   Book-Title           1031273 non-null  object
 2   Book-Author          1031273 non-null  object
 3   Year-Of-Publication  1031273 non-null  int32 
 4   Publisher            1031273 non-null  object
 5   Image-URL-M          1031273 non-null  object
 6   User-ID              1031273 non-null  int64 
 7   Book-Rating          1031273 non-null  int64 
 8   Age                  1031273 non-null  int32 
dtypes: int32(2), int64(2), object(5)
memory usage: 70.8+ MB


In [109]:
# Keep only rows whose rating is non-zero and renumber them from 0.
nonzero_rating = dataset['Book-Rating'] != 0
data_processed = dataset.loc[nonzero_rating].reset_index(drop=True)
data_processed.shape

(383892, 9)

POPULARITY BASED

In [110]:
def popularity_based(dataframe, n, books_df=None):
    """Return the ``n`` most-rated books joined with their catalogue details.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Ratings table with at least ``ISBN`` and ``Book-Rating`` columns.
    n : int
        Number of top books to return. It is now honoured; previously it was
        overwritten with 50 inside the function.
    books_df : pandas.DataFrame, optional
        Catalogue joined on ``ISBN``. Defaults to the notebook-level ``books``.

    Returns
    -------
    pandas.DataFrame
        One row per ISBN in descending order of rating count; the
        ``Book-Rating`` column holds that count.
    """
    catalog = books if books_df is None else books_df
    rating_counts = (
        dataframe.groupby('ISBN')['Book-Rating']
        .count()
        .sort_values(ascending=False)
        .head(n)
        .reset_index()
    )
    return pd.merge(rating_counts, catalog, on='ISBN')


In [111]:
print("Top 50 popular books are: ")
popular_df=popularity_based(data_processed, 50)
popular_df

Top 50 popular books are: 


Unnamed: 0,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
0,0316666343,707,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown",http://images.amazon.com/images/P/0316666343.01.MZZZZZZZ.jpg
1,0971880107,581,Wild Animus,Rich Shapero,2004,Too Far,http://images.amazon.com/images/P/0971880107.01.MZZZZZZZ.jpg
2,0385504209,487,The Da Vinci Code,Dan Brown,2003,Doubleday,http://images.amazon.com/images/P/0385504209.01.MZZZZZZZ.jpg
3,0312195516,383,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA,http://images.amazon.com/images/P/0312195516.01.MZZZZZZZ.jpg
4,0060928336,320,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells,1997,Perennial,http://images.amazon.com/images/P/0060928336.01.MZZZZZZZ.jpg
5,059035342X,315,Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)),J. K. Rowling,1999,Arthur A. Levine Books,http://images.amazon.com/images/P/059035342X.01.MZZZZZZZ.jpg
6,0142001740,307,The Secret Life of Bees,Sue Monk Kidd,2003,Penguin Books,http://images.amazon.com/images/P/0142001740.01.MZZZZZZZ.jpg
7,0446672211,295,Where the Heart Is (Oprah's Book Club (Paperback)),Billie Letts,1998,Warner Books,http://images.amazon.com/images/P/0446672211.01.MZZZZZZZ.jpg
8,044023722X,282,A Painted House,John Grisham,2001,Dell Publishing Company,http://images.amazon.com/images/P/044023722X.01.MZZZZZZZ.jpg
9,0452282152,278,Girl with a Pearl Earring,Tracy Chevalier,2001,Plume Books,http://images.amazon.com/images/P/0452282152.01.MZZZZZZZ.jpg


In [112]:
# Mean rating per title. 'Book-Rating' is selected before aggregating so the
# mean is not computed over every other column first; that is wasted work and
# fails on the text columns in pandas versions where numeric_only defaults to
# False.
rating_avg = (
    data_processed.groupby('Book-Title')['Book-Rating']
    .mean()
    .reset_index()
    .rename(columns={'Book-Rating': 'avg_rating'})
)
rating_avg

Unnamed: 0,Book-Title,avg_rating
0,"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",9.000000
1,"Ask Lily (Young Women of Faith: Lily Series, Book 5)",8.000000
2,Dark Justice,10.000000
3,"Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth",7.142857
4,Final Fantasy Anthology: Official Strategy Guide (Brady Games),10.000000
...,...,...
135563,Ã?Â?bernachten mit Stil. Die schÃ?Â¶nsten Country Inns und Bed und Breakfast der USA.,8.000000
135564,Ã?Â?rger mit Produkt X. Roman.,7.000000
135565,Ã?Â?sterlich leben.,7.000000
135566,Ã?Â?stlich der Berge.,8.000000


In [113]:
rating_avg_round = rating_avg.round(decimals = 2)
rating_avg_round


Unnamed: 0,Book-Title,avg_rating
0,"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",9.00
1,"Ask Lily (Young Women of Faith: Lily Series, Book 5)",8.00
2,Dark Justice,10.00
3,"Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth",7.14
4,Final Fantasy Anthology: Official Strategy Guide (Brady Games),10.00
...,...,...
135563,Ã?Â?bernachten mit Stil. Die schÃ?Â¶nsten Country Inns und Bed und Breakfast der USA.,8.00
135564,Ã?Â?rger mit Produkt X. Roman.,7.00
135565,Ã?Â?sterlich leben.,7.00
135566,Ã?Â?stlich der Berge.,8.00


In [114]:
# Attach the per-title average rating to the popularity table, keep one row
# per title, and order by rating count (highest first).
popular_columns = ['Book-Title','Book-Author','Image-URL-M','Book-Rating','avg_rating']
popular_with_avg = rating_avg_round.merge(popular_df, on='Book-Title')
popular_one_per_title = popular_with_avg.drop_duplicates('Book-Title')
popular_df = popular_one_per_title[popular_columns].sort_values('Book-Rating', ascending=False)
popular_df


Unnamed: 0,Book-Title,Book-Author,Image-URL-M,Book-Rating,avg_rating
32,The Lovely Bones: A Novel,Alice Sebold,http://images.amazon.com/images/P/0316666343.01.MZZZZZZZ.jpg,707,8.19
49,Wild Animus,Rich Shapero,http://images.amazon.com/images/P/0971880107.01.MZZZZZZZ.jpg,581,4.39
25,The Da Vinci Code,Dan Brown,http://images.amazon.com/images/P/0385504209.01.MZZZZZZZ.jpg,487,8.44
40,The Red Tent (Bestselling Backlist),Anita Diamant,http://images.amazon.com/images/P/0312195516.01.MZZZZZZZ.jpg,383,8.18
6,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells,http://images.amazon.com/images/P/0060928336.01.MZZZZZZZ.jpg,320,7.88
12,Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)),J. K. Rowling,http://images.amazon.com/images/P/059035342X.01.MZZZZZZZ.jpg,315,8.94
41,The Secret Life of Bees,Sue Monk Kidd,http://images.amazon.com/images/P/0142001740.01.MZZZZZZZ.jpg,307,8.48
47,Where the Heart Is (Oprah's Book Club (Paperback)),Billie Letts,http://images.amazon.com/images/P/0446672211.01.MZZZZZZZ.jpg,295,8.14
1,A Painted House,John Grisham,http://images.amazon.com/images/P/044023722X.01.MZZZZZZZ.jpg,282,7.4
8,Girl with a Pearl Earring,Tracy Chevalier,http://images.amazon.com/images/P/0452282152.01.MZZZZZZZ.jpg,278,7.98


WEIGHTED AVERAGE RATING

In [115]:
def count_ratings_per_isbn(dataframe):
    """Count ratings per ISBN and join the catalogue details.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Ratings table with at least ``ISBN`` and ``Book-Rating`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per ISBN present in both ``dataframe`` and the notebook-level
        ``books``; ``Book-Rating`` holds the number of ratings.
    """
    rating_counts = pd.DataFrame(dataframe.groupby('ISBN')['Book-Rating'].count())
    return pd.merge(rating_counts, books, on='ISBN')

# The helper has its own name so that binding the result to `toprated` does
# not overwrite the function; before, re-running this cell failed because
# `toprated` was already a DataFrame.
toprated = count_ratings_per_isbn(data_processed)
toprated

Unnamed: 0,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
0,0000913154,1,The Way Things Work: An Illustrated Encyclopedia of Technology,C. van Amerongen (translator),1967,Simon &amp; Schuster,http://images.amazon.com/images/P/0000913154.01.MZZZZZZZ.jpg
1,0001046438,1,Liar,Stephen Fry,2002,Harpercollins Uk,http://images.amazon.com/images/P/0001046438.01.MZZZZZZZ.jpg
2,000104687X,1,"T.S. Eliot Reading \The Wasteland\"" and Other Poems""",T.S. Eliot,1993,HarperCollins Publishers,http://images.amazon.com/images/P/000104687X.01.MZZZZZZZ.jpg
3,0001047213,1,The Fighting Man,Gerald Seymour,1993,HarperCollins Publishers,http://images.amazon.com/images/P/0001047213.01.MZZZZZZZ.jpg
4,0001047973,2,Brave New World,Aldous Huxley,1999,Trafalgar Square Publishing,http://images.amazon.com/images/P/0001047973.01.MZZZZZZZ.jpg
...,...,...,...,...,...,...,...
149713,B0001FZGPI,1,The Bonesetter's Daughter,Amy Tan,2001,Putnam Pub Group,http://images.amazon.com/images/P/B0001FZGPI.01.MZZZZZZZ.jpg
149714,B0001FZGRQ,1,The Clan of the Cave Bear,Jean M. Auel,2001,Crown Publishing Group,http://images.amazon.com/images/P/B0001FZGRQ.01.MZZZZZZZ.jpg
149715,B0001GMSV2,2,Find Me,Rosie O'Donnell,2002,Warner Books,http://images.amazon.com/images/P/B0001GMSV2.01.MZZZZZZZ.jpg
149716,B0001I1KOG,1,New York Public Library Literature Companion,New York Public Library,2001,Free Press,http://images.amazon.com/images/P/B0001I1KOG.01.MZZZZZZZ.jpg


In [116]:
# Add the per-title average rating and keep one row per title.
# NOTE(review): 'Book-Rating' here is a per-ISBN count while avg_rating is a
# per-title mean, so after de-duplication the count may cover only one
# edition of a title — confirm this is intended.
toprated_columns = ['Book-Title','Book-Author','Image-URL-M','Book-Rating','avg_rating']
toprated_with_avg = toprated.merge(rating_avg, on='Book-Title')
toprated_one_per_title = toprated_with_avg.drop_duplicates('Book-Title')
toprated = toprated_one_per_title[toprated_columns]
toprated

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,Book-Rating,avg_rating
0,The Way Things Work: An Illustrated Encyclopedia of Technology,C. van Amerongen (translator),http://images.amazon.com/images/P/0000913154.01.MZZZZZZZ.jpg,1,8.000000
1,Liar,Stephen Fry,http://images.amazon.com/images/P/0001046438.01.MZZZZZZZ.jpg,1,7.500000
4,"T.S. Eliot Reading \The Wasteland\"" and Other Poems""",T.S. Eliot,http://images.amazon.com/images/P/000104687X.01.MZZZZZZZ.jpg,1,6.000000
5,The Fighting Man,Gerald Seymour,http://images.amazon.com/images/P/0001047213.01.MZZZZZZZ.jpg,1,9.000000
6,Brave New World,Aldous Huxley,http://images.amazon.com/images/P/0001047973.01.MZZZZZZZ.jpg,2,8.090909
...,...,...,...,...,...
149713,THE FIRST TARZAN OMNIBUS: Tarzan of the Apes; The Return of Tarzan; Jungle Tales of Tarzan,Edgar Rice Burroughs,http://images.amazon.com/images/P/B0000X8HIE.01.MZZZZZZZ.jpg,1,6.000000
149714,Modern Contemporary: Aspects of Art at MoMA Since 1980,P Ed Varnedoe/Antonelli Kick,http://images.amazon.com/images/P/B00013AX9E.01.MZZZZZZZ.jpg,1,6.000000
149715,Hornet Flight: A Novel,Ken Follett,http://images.amazon.com/images/P/B00014WRH4.01.MZZZZZZZ.jpg,1,8.000000
149716,"Big Four, The",Agatha Christie,http://images.amazon.com/images/P/B00019U4II.01.MZZZZZZZ.jpg,1,8.000000


In [117]:
m = toprated['Book-Rating'].quantile(0.90)
m

4.0

In [118]:
C = toprated['avg_rating'].mean()
print('The average rating of a book is', C,'/10.')

The average rating of a book is 7.522777438610857 /10.


In [119]:
def weighted_rating(x, m=m, C=C):
    """Blend a book's own average rating with the overall average ``C``.

    ``x`` supplies the rating count (``'Book-Rating'``) and the mean rating
    (``'avg_rating'``). The book's mean is weighted by ``v / (v + m)`` and
    ``C`` by ``m / (v + m)``, so a larger count pulls the score towards the
    book's own mean. ``m`` and ``C`` default to the notebook-level values
    bound when this function is defined.
    """
    votes = x['Book-Rating']
    mean_rating = x['avg_rating']
    total_weight = votes + m
    own_part = (votes / total_weight) * mean_rating
    prior_part = (m / total_weight) * C
    return own_part + prior_part

In [120]:
# Keep books with at least m ratings. .copy() gives an independent frame so
# the column assignments below do not write into a slice of the previous
# DataFrame (the resulting SettingWithCopyWarning would be hidden by the
# global warnings filter).
toprated = toprated[toprated['Book-Rating'] >= m].copy()
toprated['score'] = toprated.apply(weighted_rating, axis=1)
toprated = toprated.sort_values('score', ascending=False)
# Round only after sorting so the ordering uses the full-precision score.
toprated['score'] = toprated['score'].round(decimals = 2)
toprated['avg_rating'] = toprated['avg_rating'].round(decimals = 2)

In [121]:
toprated1 = toprated.head(50)
toprated1

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,Book-Rating,avg_rating,score
51697,Harry Potter and the Chamber of Secrets Postcard Book,J. K. Rowling,http://images.amazon.com/images/P/0439425220.01.MZZZZZZZ.jpg,23,9.87,9.52
105697,Dilbert: A Book of Postcards,Scott Adams,http://images.amazon.com/images/P/0836213319.01.MZZZZZZZ.jpg,13,9.92,9.36
133284,Postmarked Yesteryear: 30 Rare Holiday Postcards,Pamela E. Apkarian-Russell,http://images.amazon.com/images/P/1888054557.01.MZZZZZZZ.jpg,11,10.0,9.34
105785,Calvin and Hobbes,Bill Watterson,http://images.amazon.com/images/P/0836220889.01.MZZZZZZZ.jpg,24,9.58,9.29
105741,The Authoritative Calvin and Hobbes (Calvin and Hobbes),Bill Watterson,http://images.amazon.com/images/P/0836218221.01.MZZZZZZZ.jpg,20,9.6,9.25
27845,"The Two Towers (The Lord of the Rings, Part 2)",J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339711.01.MZZZZZZZ.jpg,83,9.33,9.25
90505,"My Sister's Keeper : A Novel (Picoult, Jodi)",Jodi Picoult,http://images.amazon.com/images/P/0743454529.01.MZZZZZZZ.jpg,22,9.55,9.23
75490,"The Return of the King (The Lord of The Rings, Part 3)",J. R. R. Tolkien,http://images.amazon.com/images/P/0618002243.01.MZZZZZZZ.jpg,16,9.62,9.2
45734,The Sneetches and Other Stories,Dr. Seuss,http://images.amazon.com/images/P/0394800893.01.MZZZZZZZ.jpg,8,10.0,9.17
24093,Kiss of the Night (A Dark-Hunter Novel),Sherrilyn Kenyon,http://images.amazon.com/images/P/0312992416.01.MZZZZZZZ.jpg,10,9.8,9.15


In [122]:
# Persist the two recommendation tables. The `with` blocks close each file
# handle deterministically, so the pickles are fully flushed to disk.
# (pickle is already imported in the first cell.)
with open('popular.pkl', 'wb') as popular_file:
    pickle.dump(popular_df, popular_file)
with open('toprated1.pkl', 'wb') as toprated_file:
    pickle.dump(toprated1, toprated_file)

Collaborative Filtering (User-Item Filtering)

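Item-based collaborative filtering on the user-item rating matrix: books are compared by the ratings they received from the same users.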

In [123]:
df = ratings.merge(books,on='ISBN')


In [124]:
# Users with more than 200 rating rows. Only 'Book-Rating' is counted;
# counting every column first and then discarding all but one was wasted work.
x = df.groupby('User-ID')['Book-Rating'].count() > 200
newx = x[x].index
newx

Int64Index([   254,   2276,   2766,   2977,   3363,   4017,   4385,   6251,
              6323,   6543,
            ...
            271705, 273979, 274004, 274061, 274301, 274308, 275970, 277427,
            277639, 278418],
           dtype='int64', name='User-ID', length=811)

In [125]:
newdf = df[df['User-ID'].isin(newx)]
newdf

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.01.MZZZZZZZ.jpg
5,23768,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.01.MZZZZZZZ.jpg
7,28523,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.01.MZZZZZZZ.jpg
15,77940,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.01.MZZZZZZZ.jpg
16,81977,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.01.MZZZZZZZ.jpg
...,...,...,...,...,...,...,...,...
1031022,275970,1880837927,0,The Theology of the Hammer,Millard Fuller,1994,Smyth &amp; Helwys Publishing,http://images.amazon.com/images/P/1880837927.01.MZZZZZZZ.jpg
1031023,275970,188717897X,0,"The Ordeal of Integration: Progress and Resentment in America's \Racial\"" Crisis (Ordeal of Integration)""",Orlando Patterson,1998,Civitas Book Publisher,http://images.amazon.com/images/P/188717897X.01.MZZZZZZZ.jpg
1031024,275970,1888889047,0,Pushcart's Complete Rotten Reviews &amp; Rejections,Bill Henderson,1998,Pushcart Press,http://images.amazon.com/images/P/1888889047.01.MZZZZZZZ.jpg
1031025,275970,1931868123,0,There's a Porcupine in My Outhouse: Misadventures of a Mountain Man Wannabe (Capital Discoveries Book),Mike Tougias,2002,Capital Books (VA),http://images.amazon.com/images/P/1931868123.01.MZZZZZZZ.jpg


In [126]:
# Titles with at least 50 rating rows among the selected users. Only
# 'Book-Rating' is counted instead of every column.
y = newdf.groupby('Book-Title')['Book-Rating'].count() >= 50
newy = y[y].index
newy

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Day Late and a Dollar Short', 'A Fine Balance',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=707)

In [127]:
df = newdf[newdf['Book-Title'].isin(newy)]
df

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
63,278418,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.01.MZZZZZZZ.jpg
65,3363,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.01.MZZZZZZZ.jpg
66,7158,0446520802,10,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.01.MZZZZZZZ.jpg
69,11676,0446520802,10,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.01.MZZZZZZZ.jpg
74,23768,0446520802,6,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.01.MZZZZZZZ.jpg
...,...,...,...,...,...,...,...,...
1026871,266865,0531001725,10,The Catcher in the Rye,Jerome David Salinger,1973,Scholastic Library Pub,http://images.amazon.com/images/P/0531001725.01.MZZZZZZZ.jpg
1028067,269566,0670809381,0,Echoes,Maeve Binchy,1986,Penguin USA,http://images.amazon.com/images/P/0670809381.01.MZZZZZZZ.jpg
1028921,271284,0440910927,0,The Rainmaker,John Grisham,1995,Island,http://images.amazon.com/images/P/0440910927.01.MZZZZZZZ.jpg
1029214,271705,B0001PIOX4,0,Fahrenheit 451,Ray Bradbury,1993,Simon &amp; Schuster,http://images.amazon.com/images/P/B0001PIOX4.01.MZZZZZZZ.jpg


In [128]:
pt = df.pivot_table(index='Book-Title',columns='User-ID',values='Book-Rating')
pt


User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,10.0,,,,,,0.0,,,
1st to Die: A Novel,,,,,,,,,,9.0,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,0.0,...,,,,,,0.0,,,0.0,
4 Blondes,,,,,,,,0.0,,,...,,,,,,,,,,
A Bend in the Road,0.0,,7.0,,,,,,,,...,,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,7.0,,,,,,0.0,...,,9.0,,,,,0.0,,,
You Belong To Me,,,,,,,,,0.0,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,0.0,,,0.0,,,...,,,,,,,0.0,,,
Zoya,,,,,,,,,,,...,,0.0,,,,,,,,


In [129]:
pt.fillna(0,inplace=True)
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [130]:
# Pairwise cosine similarity between the rows of `pt` (one row per book
# title). cosine_similarity is already imported in the first cell, so the
# duplicate mid-notebook import is removed.
similarity_scores_useritem = cosine_similarity(pt)
similarity_scores_useritem

array([[1.        , 0.10255025, 0.01220856, ..., 0.12110367, 0.07347567,
        0.04316046],
       [0.10255025, 1.        , 0.2364573 , ..., 0.07446129, 0.16773875,
        0.14263397],
       [0.01220856, 0.2364573 , 1.        , ..., 0.04558758, 0.04938579,
        0.10796119],
       ...,
       [0.12110367, 0.07446129, 0.04558758, ..., 1.        , 0.07085128,
        0.0196177 ],
       [0.07347567, 0.16773875, 0.04938579, ..., 0.07085128, 1.        ,
        0.10602962],
       [0.04316046, 0.14263397, 0.10796119, ..., 0.0196177 , 0.10602962,
        1.        ]])

In [131]:
def recommend(book_name, top_n=9):
    """Return the books whose rating vectors are most similar to ``book_name``.

    Parameters
    ----------
    book_name : str
        Title to look up in the row labels of the notebook-level ``pt``.
    top_n : int, optional
        Number of similar books to return (default 9, the previous fixed size).

    Returns
    -------
    list of list
        One ``[title, author, image_url]`` entry per similar book, most
        similar first.

    Raises
    ------
    IndexError
        If ``book_name`` is not a row label of ``pt``. The exception type is
        unchanged, but it now carries an explanatory message.
    """
    matches = np.where(pt.index == book_name)[0]
    if matches.size == 0:
        raise IndexError(f"{book_name!r} is not in the similarity matrix")
    index = matches[0]

    # Stable descending sort: ties keep their original row order.
    ranked_rows = np.argsort(-similarity_scores_useritem[index], kind='stable')
    # Exclude the query book by identity instead of assuming it sorts first;
    # a tie at the top could otherwise return the query itself.
    similar_rows = [row for row in ranked_rows if row != index][:top_n]

    data = []
    for row in similar_rows:
        # One catalogue row per title is enough for title/author/image.
        temp_df = books[books['Book-Title'] == pt.index[row]].drop_duplicates('Book-Title')
        item = []
        item.extend(list(temp_df['Book-Title'].values))
        item.extend(list(temp_df['Book-Author'].values))
        item.extend(list(temp_df['Image-URL-M'].values))
        data.append(item)

    return data


In [132]:
# Persist the similarity matrix, the catalogue and the pivot table. Each
# `with` block closes its file handle as soon as the dump finishes.
with open('user_item.pkl', 'wb') as similarity_file:
    pickle.dump(similarity_scores_useritem, similarity_file)
with open('books.pkl', 'wb') as books_file:
    pickle.dump(books, books_file)
with open('pt.pkl', 'wb') as pivot_file:
    pickle.dump(pt, pivot_file)

Books By Same Authors, Publishers

In [133]:
def get_books(dataframe, name):
    """Print up to ten other titles by the author of the given book.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows for the selected book; the first distinct ``Book-Author`` value
        is used as the author.
    name : str
        Title of the selected book; rows with this title are excluded.

    The candidates come from the notebook-level ``data_processed`` and are
    joined with ``rating_avg_round`` before printing. Nothing is returned.
    """
    print("\nBooks by same Author:\n")
    au = dataframe['Book-Author'].unique()
    author = au[0] if len(au) else None

    data = data_processed[data_processed['Book-Title'] != name]

    # Filter unconditionally: an author with no other titles gives an empty
    # frame. The previous membership check left k2 unassigned in that case
    # and the next line raised UnboundLocalError.
    k2 = data[data['Book-Author'] == author]
    # NOTE(review): the sort is ascending, so the lowest-rated rows come
    # first — confirm whether descending was intended.
    k2 = k2.sort_values(by=['Book-Rating']).drop_duplicates(subset=['Book-Title'])
    k3 = k2.head(10)
    k3 = k3.merge(rating_avg_round,on='Book-Title')

    print(k3)


In [134]:
# Persist the cleaned ratings and the rounded per-title averages. The `with`
# blocks close each file handle as soon as its dump finishes.
with open('data_processed.pkl', 'wb') as processed_file:
    pickle.dump(data_processed, processed_file)
with open('rating_avg_round.pkl', 'wb') as averages_file:
    pickle.dump(rating_avg_round, averages_file)


In [135]:
def get_books1(dataframe, name):
    """Print up to ten other titles from the publisher of the given book.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows for the selected book; the first distinct ``Publisher`` value is
        used as the publisher.
    name : str
        Title of the selected book; rows with this title are excluded.

    The candidates come from the notebook-level ``data_processed`` and are
    joined with ``rating_avg_round`` before printing. Nothing is returned.
    """
    pu = dataframe['Publisher'].unique()
    publisher = pu[0] if len(pu) else None

    data = data_processed[data_processed['Book-Title'] != name]
    # Filter unconditionally: a publisher with no other titles gives an empty
    # frame. The previous membership check left k4 unassigned in that case
    # and the next line raised UnboundLocalError.
    k4 = data[data['Publisher'] == publisher]
    # NOTE(review): the sort is ascending, so the lowest-rated rows come
    # first — confirm whether descending was intended.
    k4 = k4.sort_values(by=['Book-Rating']).drop_duplicates(subset=['Book-Title'])
    k5 = k4.head(10)
    k5 = k5.merge(rating_avg_round,on='Book-Title')
    print(k5)


In [136]:
# Ask for a title and, when it exists in the processed data, print other
# books from the same publisher.
bookName = input("Enter a book name: ")

known_titles = list(data_processed['Book-Title'].unique())
if bookName not in known_titles:
    print("Invalid Book Name!!")
else:
    d = data_processed[data_processed['Book-Title'] == bookName]
    get_books1(d, bookName)

         ISBN  \
0  0452281326   
1  0452266564   
2  0452273587   
3  0452278902   
4  0452280524   
5  0452273749   
6  0452280621   
7  0452268060   
8  0452281881   
9  0452283728   

                                                                                  Book-Title  \
0  The Feeling Good Handbook                                                                   
1  The Claiming of Sleeping Beauty (Erotic Adventures of Sleeping Beauty)                      
2  Guardian Angels and Spirit Guides: True Accounts of Benevolent Beings from the Other Side   
3  The Green Mile: The Complete Serial Novel                                                   
4  New Ideas from Dead Economists: An Introduction to Modern Economic Thought                  
5  Haunted: Tales of the Grotesque                                                             
6  Beloved                                                                                     
7  How the Garcia Girls Lost Their Accents (