In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import plotly.express as px
import plotly.graph_objects as go

data = pd.read_csv("books_data.csv", nrows=20000)
data.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,1/1/2005,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,3/1/2003,http://books.google.nl/books?id=399SPgAACAAJ&d...,,


In [2]:
data.columns

Index(['Title', 'description', 'authors', 'image', 'previewLink', 'publisher',
       'publishedDate', 'infoLink', 'categories', 'ratingsCount'],
      dtype='object')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Title          19999 non-null  object 
 1   description    14894 non-null  object 
 2   authors        18897 non-null  object 
 3   image          16670 non-null  object 
 4   previewLink    19916 non-null  object 
 5   publisher      14146 non-null  object 
 6   publishedDate  19678 non-null  object 
 7   infoLink       19916 non-null  object 
 8   categories     17796 non-null  object 
 9   ratingsCount   4732 non-null   float64
dtypes: float64(1), object(9)
memory usage: 1.5+ MB


In [4]:
# fig = px.histogram(data, x='ratingsCount', 
#                    nbins=30, 
#                    title='Distribution of Average Ratings')
# fig.update_xaxes(title_text='Average Rating')
# fig.update_yaxes(title_text='Frequency')
# fig.show()

In [5]:
# top_authors = data['authors'].value_counts().head(10)
# fig = px.bar(top_authors, x=top_authors.values, y=top_authors.index, orientation='h',
#              labels={'x': 'Number of Books', 'y': 'Author'},
#              title='Number of Books per Author')
# fig.show()

In [6]:
# Convert 'average_rating' to a numeric data type
data['ratingsCount'] = pd.to_numeric(data['ratingsCount'], 
                                       errors='coerce')

In [7]:
# Create a new column 'book_content' by combining 'title' and 'authors'
data['book_content'] = (data['Title'] + ' ') * 2 + data['description'] + ' ' + data['authors'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

In [8]:
data.head(n=5)

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount,book_content
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],,
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,1/1/2005,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],,Dr. Seuss: American Icon Dr. Seuss: American I...
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],,Wonderful Worship in Smaller Churches Wonderfu...
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],,Whispers of the Wicked Saints Whispers of the ...
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,3/1/2003,http://books.google.nl/books?id=399SPgAACAAJ&d...,,,


In [9]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['book_content'].values.astype('U'))

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
# Compute the cosine similarity between books
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [11]:
def recommend_books(book_title, threshold, cosine_sim=cosine_sim):
    # Get the index of the book that matches the title
    idx = data[data['Title'] == book_title].index[0]

    # Get the cosine similarity scores for all books with this book
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = [(i, "{:.5f}".format(score)) for i, score in sim_scores[1:] if score >= threshold]

    # Get the most similar books in sorted sequence (excluding the input book, index 0 is the input book)
    # sim_scores = sim_scores[0:20]

    # Get the book titles and their similarity scores
    book_recommendations = [(data['Title'].iloc[i[0]], i[1]) for i in sim_scores]

    return book_recommendations

In [None]:
from IPython.display import clear_output
book_title = ''
while (book_title != 'q'):
  clear_output(wait=True)
  book_title = input("Enter the title of a book: ")
  recommended_books = recommend_books(book_title, threshold=0.1)
  f = open('output.txt', 'w')
  f.write('Counts: ' + str(len(recommended_books)) + '\n\n')
  for book in recommended_books:
    f.write(book[1] + ' | ' + str(book[0]) + '\n')
  f.close()
  print('Found: ' + str(len(recommended_books)))

Found: 55
