# CAI 2820C - AI Applications Solutions

## Spring 2025

## Project 1 - Build a Streamlit App for Categorizing Books

## Instructor: Claudio S. Castillo 

## Setting up the environment

In [9]:
!pip install pandas scikit-learn nltk joblib openpyxl



In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans  # Replace NMF with KMeans
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

import numpy as np
import joblib
import os


## Collecting Data

In [15]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kalop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kalop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kalop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Load dataset
dataset_path = "AllITBooks_DataSet.xlsx" 
data = pd.read_excel(dataset_path)

In [17]:
# Fill missing values in the subtitle
data["Sub_title"].fillna("", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Sub_title"].fillna("", inplace=True)


In [18]:
# Combine important text fields into a single text column
data["ConsolidatedText"] = data["Book_name"] + " " + data["Sub_title"] + " " + data["Description"]

# Print the first few rows to verify
data.head()

Unnamed: 0.1,Unnamed: 0,Book_name,Sub_title,Author,Year,Pages,Language,Size,Format,Description,Category,ConsolidatedText
0,0,"Pro ASP.NET Core 3, 8th. Edition",Develop Cloud-Ready Web Applications Using MVC...,Adam Freeman,2020,1400,English,38.3 MB,"PDF, ePub",\nBook Description:\nThis bestselling comprehe...,ASP.NET,"Pro ASP.NET Core 3, 8th. Edition Develop Cloud..."
1,1,Modern Data Mining Algorithms in C++ and CUDA C,Recent Developments in Feature Extraction and ...,Timothy Masters,2020,237,English,2.3 MB,"PDF, ePub",\nBook Description:\nDiscover a variety of dat...,C & C++,Modern Data Mining Algorithms in C++ and CUDA ...
2,2,SAS Stored Processes,A Practical Guide to Developing Web Applications,Philip Mason,2020,338,English,11.2 MB,"PDF, ePub",\nBook Description:\nCustomize the SAS Stored ...,Software,SAS Stored Processes A Practical Guide to Deve...
3,3,Advanced Perl Programming,From Advanced to Expert,"William ""Bo"" Rothwell",2020,308,English,4.9 MB,"PDF, ePub",\nBook Description:\nWilliam “Bo” Rothwell’s A...,Perl,Advanced Perl Programming From Advanced to Exp...
4,4,Articulate Storyline Essentials,Discover Articulate Storyline's ability to enh...,Ashley Chiasson,2015,180,English,8.8 MB,PDF,\nBook Description:\nStoryline is a powerful e...,Computers & Technology,Articulate Storyline Essentials Discover Artic...


In [19]:
data["ConsolidatedText"]

0       Pro ASP.NET Core 3, 8th. Edition Develop Cloud...
1       Modern Data Mining Algorithms in C++ and CUDA ...
2       SAS Stored Processes A Practical Guide to Deve...
3       Advanced Perl Programming From Advanced to Exp...
4       Articulate Storyline Essentials Discover Artic...
                              ...                        
8553    Dreamweaver CS6 Mobile and Web Development wit...
8554    Beginning Amazon Web Services with Node.js  \n...
8555    Pro Grunt.js  \nBook Description:\nPro Grunt.j...
8556    MongoDB Cookbook Over 80 practical recipes to ...
8557    Foundation HTML5 with CSS3 A Modern Guide and ...
Name: ConsolidatedText, Length: 8558, dtype: object

In [20]:
stop_words = set(stopwords.words('english'))
list(stop_words)[:5]

['under', 'where', 'wasn', 'o', 'can']

## Cleaning text

In [21]:
# Function to preprocess text
def preprocess_text(text):

    text = str(text).lower()  # Convert to lowercase

    tokens = word_tokenize(text)  # Tokenize text

    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]  # Remove stopwords and non-alphanumeric tokens
    
    return " ".join(filtered_tokens)

In [22]:
# Apply the cleaning function to the dataset
data["CleanedDescription"] = data["ConsolidatedText"].apply(preprocess_text)

In [25]:
# Display before and after cleaning
data[["ConsolidatedText","CleanedDescription"]]

Unnamed: 0,ConsolidatedText,CleanedDescription
0,"Pro ASP.NET Core 3, 8th. Edition Develop Cloud...",pro core 3 8th edition develop web application...
1,Modern Data Mining Algorithms in C++ and CUDA ...,modern data mining algorithms cuda c recent de...
2,SAS Stored Processes A Practical Guide to Deve...,sas stored processes practical guide developin...
3,Advanced Perl Programming From Advanced to Exp...,advanced perl programming advanced expert book...
4,Articulate Storyline Essentials Discover Artic...,articulate storyline essentials discover artic...
...,...,...
8553,Dreamweaver CS6 Mobile and Web Development wit...,dreamweaver cs6 mobile web development html5 c...
8554,Beginning Amazon Web Services with Node.js \n...,beginning amazon web services book description...
8555,Pro Grunt.js \nBook Description:\nPro Grunt.j...,pro book description pro gets quickly popular ...
8556,MongoDB Cookbook Over 80 practical recipes to ...,mongodb cookbook 80 practical recipes design d...


## Creating a word embedding

In [26]:
# Define TF-IDF Vectorizer with a feature limit
tfidf_vectorizer = TfidfVectorizer(max_features=500)

# Transform the cleaned descriptions into a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(data["CleanedDescription"])

In [27]:
# Get feature names (words in the vocabulary)
feature_names = tfidf_vectorizer.get_feature_names_out()

In [28]:
# Convert the TF-IDF matrix to a DataFrame for easier visualization
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

In [29]:
# Display the first few rows of the TF-IDF matrix
tfidf_df.head()

Unnamed: 0,2010,2012,2013,2nd,3d,able,access,across,action,add,...,wordpress,work,working,works,world,write,writing,written,xml,years
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.042718,0.0,0.0,0.0,0.0,0.046103,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.060617,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.072213,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.065731,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33194,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Modeling 

In [30]:
# Define the number of clusters (same as the original NMF components)
num_clusters = 10  

# Train a K-Means clustering model
kmeans_model = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans_model.fit(tfidf_matrix)

# Assign each book a cluster
data["Cluster"] = kmeans_model.predict(tfidf_matrix)

# Display cluster assignments
data[["Book_name", "Cluster"]].head()


Unnamed: 0,Book_name,Cluster
0,"Pro ASP.NET Core 3, 8th. Edition",9
1,Modern Data Mining Algorithms in C++ and CUDA C,0
2,SAS Stored Processes,9
3,Advanced Perl Programming,3
4,Articulate Storyline Essentials,3


## Get topics 

In [32]:
# Retrieve feature names from the TF-IDF vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# Get cluster centers (each cluster's representative word importance)
cluster_centers = kmeans_model.cluster_centers_

# Extract keywords for each cluster
topics = []

for cluster_idx, cluster_center in enumerate(cluster_centers):
    # Get the top 10 words that define the cluster
    top_words = [feature_names[i] for i in cluster_center.argsort()[:-11:-1]]
    print(f"Cluster {cluster_idx}: {', '.join(top_words)}")
    topics.append(", ".join(top_words))

Cluster 0: data, python, analysis, book, learning, big, machine, learn, using, programming
Cluster 1: game, games, unity, development, create, engine, book, 3d, learn, android
Cluster 2: java, programming, book, applications, web, edition, development, application, language, using
Cluster 3: book, programming, learn, using, use, applications, description, guide, design, software
Cluster 4: oracle, sql, database, server, data, book, performance, databases, 2012, business
Cluster 5: windows, microsoft, sharepoint, excel, office, 2010, 2013, book, new, server
Cluster 6: network, security, networks, book, linux, wireless, networking, systems, system, secure
Cluster 7: exam, study, certification, practice, guide, questions, review, microsoft, security, cisco
Cluster 8: android, ios, apps, app, iphone, swift, mobile, development, book, apple
Cluster 9: web, javascript, applications, book, html5, application, development, css, build, jquery


In [33]:
categories = {
    0: "Data Science and Machine Learning",
    1: "Game Development",
    2: "Java Programming and Web Applications",
    3: "General Programming and Software Development",
    4: "Databases and SQL Administration",
    5: "Microsoft Technologies (Windows, Office, SharePoint)",
    6: "Networking and Cybersecurity",
    7: "Certification and Exam Preparation",
    8: "Mobile App Development (Android & iOS)",
    9: "Web Development (HTML, CSS, JavaScript)"
}

In [34]:
# Assign categories based on K-Means clustering
data["AssignedCluster"] = kmeans_model.predict(tfidf_matrix)
data["Topic_Keywords"] = [topics[i] for i in data["AssignedCluster"]]
data["Topic"] = data["AssignedCluster"].map(categories)

In [35]:
# Display the first few assigned topics
data[["Book_name", "Topic", "Topic_Keywords"]].head()

Unnamed: 0,Book_name,Topic,Topic_Keywords
0,"Pro ASP.NET Core 3, 8th. Edition","Web Development (HTML, CSS, JavaScript)","web, javascript, applications, book, html5, ap..."
1,Modern Data Mining Algorithms in C++ and CUDA C,Data Science and Machine Learning,"data, python, analysis, book, learning, big, m..."
2,SAS Stored Processes,"Web Development (HTML, CSS, JavaScript)","web, javascript, applications, book, html5, ap..."
3,Advanced Perl Programming,General Programming and Software Development,"book, programming, learn, using, use, applicat..."
4,Articulate Storyline Essentials,General Programming and Software Development,"book, programming, learn, using, use, applicat..."


In [37]:
print(data.Description[0])


Book Description:
This bestselling comprehensive guide to ASP.NET Core is the only book you need for ASP.NET Core development. Period.
Professional developers will produce leaner applications for the ASP.NET Core platform using the guidance in this full-color book, now in its 8th edition and updated for ASP.NET Core 3. It contains detailed explanations of the ASP.NET Core platform and the application frameworks it supports. This edition puts ASP.NET Core 3 into context and dives deep into the tools and techniques required to build modern, extensible, web applications. New features and capabilities such as MVC 3, Razor Pages, Blazor Server, and Blazor WebAssembly are covered, along with demonstrations of how they are applied.
ASP.NET Core 3 is the latest evolution of Microsoft’s ASP.NET web platform and provides a “host-agnostic” framework and a high-productivity programming model that promotes cleaner code architecture, test-driven development, and powerful extensibility.
Best-selling

## Predicting a new category based on a book description

In [39]:
bookdescription = """As data floods into your company, you need to put it to work right away―and SQL is the best tool for the job. With the latest edition of this introductory guide, author Alan Beaulieu helps developers get up to speed with SQL fundamentals for writing database applications, performing administrative tasks, and generating reports. You’ll find new chapters on SQL and big data, analytic functions, and working with very large databases.

Each chapter presents a self-contained lesson on a key SQL concept or technique using numerous illustrations and annotated examples. Exercises let you practice the skills you learn. Knowledge of SQL is a must for interacting with data. With Learning SQL, you’ll quickly discover how to put the power and flexibility of this language to work.

Move quickly through SQL basics and several advanced features
Use SQL data statements to generate, manipulate, and retrieve data
Create database objects, such as tables, indexes, and constraints with SQL schema statements
Learn how datasets interact with queries; understand the importance of subqueries
Convert and manipulate data with SQL’s built-in functions and use conditional logic in data statements"""

In [40]:

# Clean the book description
bookdescription_cleaned = preprocess_text(bookdescription)

# Convert the cleaned text into TF-IDF features
bookdescription_vectorized = tfidf_vectorizer.transform([bookdescription_cleaned])

# Predict the cluster
predicted_cluster = kmeans_model.predict(bookdescription_vectorized)[0]

# Get distances to all clusters
distances = kmeans_model.transform(bookdescription_vectorized)[0]

# Get the top 3 most related clusters
closest_clusters = distances.argsort()[:3]

# Get the main category and two related topics
book_category = categories.get(closest_clusters[0], "Unknown")
related_topics = [categories.get(closest_clusters[i], "Unknown") for i in range(1, 3)]

# Display the results
print(f"Book Category: {book_category}")
print(f"Related Topics: {', '.join(related_topics)}")

Book Category: Databases and SQL Administration
Related Topics: Data Science and Machine Learning, General Programming and Software Development


In [41]:
# Convert vectorized features into a DataFrame for visualization
pd.DataFrame(bookdescription_vectorized.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

Unnamed: 0,2010,2012,2013,2nd,3d,able,access,across,action,add,...,wordpress,work,working,works,world,write,writing,written,xml,years
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.10188,0.060861,0.0,0.0,0.0,0.074815,0.0,0.0,0.0


In [42]:
# Create a folder for saved models
if not os.path.exists("models"):
    os.mkdir("models")

# Save TF-IDF vectorizer and K-Means model
joblib.dump(tfidf_vectorizer, "models/tfidf_vectorizer.pkl")
joblib.dump(kmeans_model, "models/kmeans_model.pkl")


['models/kmeans_model.pkl']

## Working with ipywidgets

In [43]:
import numpy as np
import ipywidgets as widgets
from ipywidgets import interact
from IPython.display import display

In [44]:
# Updated categorizeBooks function using K-Means
def categorizeBooks(bookdescription):
    # Clean the input text
    bookdescription_cleaned = preprocess_text(bookdescription)

    # Convert to TF-IDF features
    bookdescription_vectorized = tfidf_vectorizer.transform([bookdescription_cleaned])

    # Predict the primary category
    predicted_cluster = kmeans_model.predict(bookdescription_vectorized)[0]

    # Get distances to all clusters
    distances = kmeans_model.transform(bookdescription_vectorized)[0]

    # Identify the top 3 most related clusters
    closest_clusters = np.argsort(distances)[:3]

    # Assign category labels
    main_category = categories.get(closest_clusters[0], "Unknown")
    related_topics = [categories.get(closest_clusters[i], "Unknown") for i in range(1, 3)]

    return f"Category: {main_category} | Related Topics: {', '.join(related_topics)}"

In [45]:
# Create an interactive widget for categorization
@interact(book_description=widgets.Textarea(
    value="",
    placeholder="Type a book description...",
    description="Book Description:",
    layout=widgets.Layout(width="500px", height="150px")
))
def interactive_categorize(book_description):
    if book_description.strip():
        category = categorizeBooks(book_description)
        print(category)
    else:
        print("Please enter a valid book description.")

interactive(children=(Textarea(value='', description='Book Description:', layout=Layout(height='150px', width=…