# Capstone **AI** Book Worm Library
**David Kanwisher**

## Install dependencies

In [None]:
!pip3 install pandas
!pip3 install -U scikit-learn
!pip3 install nltk
!pip3 install matplotlib
!pip3 install ipywidgets
!pip3 install widgetsnbextension
!pip3 install wordcloud

## Import functionality from dependencies

In [None]:
import ast
import html

import ipywidgets as widgets
import matplotlib.pyplot as plt
import nltk
import pandas as pd
# NOTE(review): IPython has deprecated importing display/clear_output from
# IPython.core.display (since 7.14); consider `from IPython.display import ...`
# instead - confirm against the installed IPython version.
from IPython.core.display import display, HTML, clear_output
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

## Download helper data

In [None]:
nltk.download('punkt') # Sentence tokenization
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource instead of
# 'punkt'; fetching both keeps word_tokenize working on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords') # Words that don't contribute to pattern matching

## Use a data frame to read the CSV, clean up the data, and show the table

In [None]:
# Read the book data and, in the same step, discard the 'Unnamed: 0' column,
# which is a duplicate of the data frame's existing index.
df = pd.read_csv('book-data.csv').drop(columns='Unnamed: 0')
df.head()

## A Histogram to show the distribution of books by their rating
Visual 1 of 3



In [None]:
# Draw the rating distribution on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.hist(df['Avg_Rating'], bins=100, edgecolor='black')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Book Count')
ax.set_title('Distribution of Books by Average Rating')
plt.show()

## Create a method to tokenize the description text, save only words that are not stop words

In [None]:
# Lazily-built cache of the English stop words (filled on first use).
_stop_words_cache = None

def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = frozenset(stopwords.words('english'))
    return _stop_words_cache

def preprocess_text(text):
    """Reduce a piece of text to its lower-cased, alphabetic, non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw text to clean (e.g. a book description).

    Returns
    -------
    str
        The surviving words, in their original order, joined by single spaces.
        An empty string is returned when no word survives the filtering.
    """
    # The stop-word list does not change between calls, so it is fetched from the
    # cache instead of being reloaded for every description.
    stop_words = _english_stop_words()
    lowered = (token.lower() for token in word_tokenize(text))
    # isalpha() drops punctuation, numbers and mixed tokens such as "it's"
    return ' '.join(
        word for word in lowered
        if word.isalpha() and word not in stop_words
    )

## Create a new column that is the processed/tokenized description

In [None]:
# Apply the preprocessing to each description.
# Missing descriptions are replaced with an empty string first: casting NaN straight
# to str would produce the literal text 'nan', which would then be tokenized and
# clustered as if it were a real word.
df['Tokenized_Description'] = df['Description'].fillna('').astype('str').apply(preprocess_text)

df.head()

## Use KMeans to cluster data based on the tokenized description, save the results to a "Cluster" column

In [None]:
# Number of clusters has been lowered to speed up processing time for evaluation
k = 5

# Turn the cleaned descriptions into TF-IDF feature vectors
transformed = TfidfVectorizer().fit_transform(df['Tokenized_Description'])

# Fixed random_state keeps the cluster assignment repeatable between runs
kmeans = KMeans(n_clusters=k, random_state=41, n_init=10)
kmeans.fit(transformed)

# Record which cluster each book was assigned to
df['Cluster'] = kmeans.labels_

## Bar charts of the 3 largest clusters with the top 10 genres found in each cluster
Visual 2 of 3

In [None]:

# The 'Genres' column is parsed with ast.literal_eval into Python objects, then
# exploded so that every (book, genre) pair gets its own row.
df['Literal_Genres'] = df['Genres'].apply(ast.literal_eval)
df_exploded = df.explode('Literal_Genres')
genre_counts_by_cluster = (
    df_exploded
    .groupby('Cluster')['Literal_Genres']
    .value_counts()
)

# Identify the top three largest clusters
largest_clusters = df['Cluster'].value_counts().nlargest(3).index

# One panel per cluster, sharing the y axis so the bar heights are comparable
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), sharey=True)

# Plot the top 10 genres for each cluster
for ax, cluster in zip(axes, largest_clusters):
    top_genres = genre_counts_by_cluster[cluster].head(10)
    top_genres.plot(kind='bar', ax=ax)
    ax.set(
        title=f'Top 10 Genres in Cluster #{cluster} of {k}',
        xlabel='Genre',
        ylabel='Number of Books',
    )

fig.tight_layout()
plt.show()



## Create instances of the widgets and the logic to render (and re-render) the user's query. Includes error handling. Uses a combo box to offer suggestions while typing.

## Runs the application (interactive UI)


*   Type in a book and look for a match in the suggestions
*   Click Search
*   Up to 5 recommendations, sampled at random from the selected book's cluster, will show, ordered by average rating
*   New books can be searched and new results will be displayed



In [None]:
result = pd.DataFrame() # initialize as an empty data frame
output = widgets.Output()

# Title entry with type-ahead suggestions taken from the 'Book' column
combo_box = widgets.Combobox(placeholder="Enter a book title", options=df['Book'].tolist())

# The click handler looks up get_cluster when the button is pressed, so it is
# fine that the function is defined further down in this cell.
button = widgets.Button(description="Search")
button.on_click(lambda _clicked: get_cluster(combo_box.value))
def renderSearch():
  """Display the search UI: page heading, title combo box, search button and output area."""
  heading = HTML('<h1>Book Worm Public Library Recommendation Search</h1>')
  for element in (heading, combo_box, button, output):
    display(element)

def get_cluster(combo_value):
  """Show up to five recommendations from the same cluster as the chosen book.

  All feedback (validation messages as well as results) is written to the
  shared ``output`` widget, replacing whatever the previous search displayed.

  Parameters
  ----------
  combo_value : str
      Book title entered in the combo box; it must exactly match a value in
      ``df['Book']``.

  Returns
  -------
  None
      The sampled recommendations are stored in the module-level ``result``
      data frame so that later cells can use them.
  """
  global result
  # Output produced inside a widget callback is only reliably shown when it is
  # captured by an Output widget, so the messages are routed through it as well.
  with output:
    clear_output(True)
    # provide a friendly message if user provides no data at all
    if not combo_value:
      print('You must enter a book title to continue.')
      return None

    book = df.loc[df['Book'] == combo_value]
    # provide a friendly message if the text the user provided is not a part of the book list
    if book.empty:
      print(f"No book was found by the name '{combo_value}', you must select a match from the search results.")
      return None

    # The same title may appear on several rows; take the first match instead of
    # .item(), which raises unless exactly one row matched.
    cluster_num = book['Cluster'].iloc[0]
    # Candidates share the cluster but exclude every row carrying the searched title
    cluster = df.loc[(df['Cluster'] == cluster_num) & (df['Book'] != combo_value)]
    if cluster.empty:
      print(f"No other books were found in the same cluster as '{combo_value}'.")
      return None

    # sample() raises when asked for more rows than are available, so cap the request
    result = cluster.sample(min(5, len(cluster))).sort_values(by='Avg_Rating', ascending=False)
    formatted_result = result[['Book', 'Author', 'Description', 'Genres', 'Avg_Rating']]
    # Escape the title so characters such as '<' or '&' are shown literally
    display(HTML('<br/><h2>Recommendations for ' + html.escape(combo_value) + ':</h2>'))
    display(formatted_result.rename(columns={'Avg_Rating': 'Average Rating'}))

# Show the heading, combo box, search button and output area defined above
renderSearch()


## Create a word cloud based on the given recommendations
Visual 3 of 3

In [None]:
if result.empty:
  print('A query in the previous step is required to display the word cloud')
else:
  # Merge the cleaned descriptions of the recommended books into one text
  concatenated_descriptions = ' '.join(result['Tokenized_Description'])
  wordcloud = WordCloud(width=800, height=400, background_color='white').generate(concatenated_descriptions)

  # Render the cloud as an image without axis ticks
  fig, ax = plt.subplots(figsize=(12, 8))
  ax.imshow(wordcloud, interpolation='bilinear')
  ax.axis('off')
  ax.set_title('Word Frequency in Recommended Books')
  plt.show()