In [1]:
# Necessary Libraries for Text Classification
from dotenv import load_dotenv

import numpy as np
import pandas as pd

import torch
from tqdm import tqdm
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Constant Variable: location of the cleaned books CSV, relative to the notebook's working directory
BOOKS_DATASET_PATH = "./dataset/books_cleaned.csv"

In [3]:
# Dataset Setup for Data Exploration: load the cleaned books CSV into a DataFrame
books_dataset = pd.read_csv(BOOKS_DATASET_PATH)

In [4]:
# Dataset Overview: number of books per raw category, most frequent first
category_counts = books_dataset["categories"].value_counts()
category_counts.reset_index()

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
...,...,...
474,Conspiracies,1
475,Brothers and sisters,1
476,Rock musicians,1
477,Community life,1


In [5]:
# Collapse the most frequent raw categories into a small set of simplified labels
# (Fiction / NonFiction plus their Children's variants). Any raw category that is
# not listed here maps to NaN.
category_mapping = {
    "Fiction": "Fiction",
    "Juvenile Fiction": "Children's Fiction",
    "Biography & Autobiography": "NonFiction",
    "History": "NonFiction",
    "Literary Criticism": "NonFiction",
    "Philosophy": "NonFiction",
    "Religion": "NonFiction",
    "Comics & Graphic Novels": "Fiction",
    "Drama": "Fiction",
    "Juvenile Nonfiction": "Children's NonFiction",
    "Science": "NonFiction",
    "Poetry": "Fiction",
}

raw_categories = books_dataset["categories"]
books_dataset["simple_categories"] = raw_categories.map(category_mapping)
books_dataset.head(10)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883: A NOVEL THAT READERS and critic...,Fiction
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982: A new 'Christie for Christmas' ...,
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736: A memorable, mesmerizing heroin...",Fiction
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897: Lewis' work on the nature of lo...,
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934: ""In The Problem of Pain, C.S. L...",
5,9780006380832,0006380832,Empires of the Monsoon,Richard Hall,"Africa, East",http://books.google.com/books/content?id=MuPEQ...,Until Vasco da Gama discovered the sea-route t...,1998.0,4.41,608.0,65.0,Empires of the Monsoon: A History of the India...,9780006380832: Until Vasco da Gama discovered ...,
6,9780006470229,000647022X,The Gap Into Madness,Stephen R. Donaldson,"Hyland, Morn (Fictitious character)",http://books.google.com/books/content?id=4oXav...,A new-cover reissue of the fourth book in the ...,1994.0,4.15,743.0,103.0,The Gap Into Madness: Chaos and Order,9780006470229: A new-cover reissue of the four...,
7,9780006472612,0006472613,Master of the Game,Sidney Sheldon,Adventure stories,http://books.google.com/books/content?id=TkTYp...,Kate Blackwell is an enigma and one of the mos...,1982.0,4.11,489.0,43540.0,Master of the Game,9780006472612: Kate Blackwell is an enigma and...,
8,9780006482079,0006482074,Warhost of Vastmark,Janny Wurts,Fiction,http://books.google.com/books/content?id=uOL0f...,"Tricked once more by his wily half-brother, Ly...",1995.0,4.03,522.0,2966.0,Warhost of Vastmark,9780006482079: Tricked once more by his wily h...,Fiction
9,9780006483014,0006483011,The Once and Future King,Terence Hanbury White,Arthurian romances,http://books.google.com/books/content?id=Jx6Bv...,An omnibus volume of the author's complete sto...,1996.0,4.04,823.0,2805.0,The Once and Future King,9780006483014: An omnibus volume of the author...,


In [6]:
# Show only the books that already received a simplified category from the mapping
has_simple_category = books_dataset["simple_categories"].notna()
books_dataset.loc[has_simple_category]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883: A NOVEL THAT READERS and critic...,Fiction
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736: A memorable, mesmerizing heroin...",Fiction
8,9780006482079,0006482074,Warhost of Vastmark,Janny Wurts,Fiction,http://books.google.com/books/content?id=uOL0f...,"Tricked once more by his wily half-brother, Ly...",1995.0,4.03,522.0,2966.0,Warhost of Vastmark,9780006482079: Tricked once more by his wily h...,Fiction
30,9780006646006,000664600X,Ocean Star Express,Mark Haddon;Peter Sutton,Juvenile Fiction,http://books.google.com/books/content?id=I2QZA...,Joe and his parents are enjoying a summer holi...,2002.0,3.50,32.0,1.0,Ocean Star Express,9780006646006: Joe and his parents are enjoyin...,Children's Fiction
46,9780007121014,0007121016,Taken at the Flood,Agatha Christie,Fiction,http://books.google.com/books/content?id=3gWlx...,A Few Weeks After Marrying An Attractive Young...,2002.0,3.71,352.0,8852.0,Taken at the Flood,9780007121014: A Few Weeks After Marrying An A...,Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,9781933648279,1933648279,Night Has a Thousand Eyes,Cornell Woolrich,Fiction,http://books.google.com/books/content?id=3Gk6s...,"""Cornell Woolrich's novels define the essence ...",2007.0,3.77,344.0,680.0,Night Has a Thousand Eyes,"9781933648279: ""Cornell Woolrich's novels defi...",Fiction
5188,9784770028969,4770028962,Coin Locker Babies,村上龍,Fiction,http://books.google.com/books/content?id=87DJw...,Rescued from the lockers in which they were le...,2002.0,3.75,393.0,5560.0,Coin Locker Babies,9784770028969: Rescued from the lockers in whi...,Fiction
5189,9788122200850,8122200850,"Cry, the Peacock",Anita Desai,Fiction,http://books.google.com/books/content?id=_QKwV...,This book is the story of a young girl obsesse...,1980.0,3.22,218.0,134.0,"Cry, the Peacock",9788122200850: This book is the story of a you...,Fiction
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535: This collection of the timeless...,NonFiction


In [7]:
# Setup the pipeline of Zero Shot Classification from Hugging Face.
# Prefer Apple's Metal backend (MPS), then CUDA, and fall back to the CPU so the
# notebook does not fail on machines where MPS is unavailable.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

pipe = pipeline("zero-shot-classification", model = "facebook/bart-large-mnli", device = device)

Device set to use mps


In [8]:
# Candidate labels for the classifier, plus one sample description of a book
# already labelled as Fiction to try the pipeline on
books_categories = ["Fiction", "NonFiction"]

is_fiction = books_dataset["simple_categories"] == "Fiction"
fiction_descriptions = books_dataset.loc[is_fiction, "description"].reset_index(drop = True)
sequence = fiction_descriptions[0]

pipe(sequences = sequence, candidate_labels = books_categories)

{'sequence': 'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst

In [9]:
# Run the classifier once and reuse the result for both the scores and the labels,
# instead of paying for a second identical inference call
sample_prediction = pipe(sequences = sequence, candidate_labels = books_categories)

# Position of the highest score in the array of scores
max_index = np.argmax(sample_prediction["scores"])

# Identify the label with the maximum score for the sequence
max_label = sample_prediction["labels"][max_index]
max_label

'Fiction'

In [10]:
def generate_predictions(sequence, categories):
    """Return the candidate label that the zero-shot pipeline scores highest.

    Args:
        sequence: Text passed to the module-level `pipe` as `sequences`.
        categories: Candidate labels passed to `pipe` as `candidate_labels`.

    Returns:
        The entry of the result's "labels" at the position of the largest
        entry of the result's "scores".
    """
    result = pipe(sequences = sequence, candidate_labels = categories)
    scores, labels = result["scores"], result["labels"]

    # The position of the largest score selects the winning label
    best_position = np.argmax(scores)
    return labels[best_position]

In [11]:
# Evaluate the classifier on the first N already-labelled descriptions of each label
EVAL_SAMPLES_PER_CATEGORY = 300

act_cats, pre_cats = [], []

for category in books_categories:
    # Filter once per label rather than re-filtering the whole dataset on every
    # iteration; head() also copes with a label that has fewer than N rows
    is_category = books_dataset["simple_categories"] == category
    descriptions = books_dataset.loc[is_category, "description"].head(EVAL_SAMPLES_PER_CATEGORY)

    for sequence in tqdm(descriptions, total = len(descriptions)):
        pre_cats.append(generate_predictions(sequence = sequence, categories = books_categories))
        act_cats.append(category)

100%|██████████| 300/300 [00:52<00:00,  5.74it/s]
100%|██████████| 300/300 [00:45<00:00,  6.62it/s]


In [12]:
# Put the actual and predicted categories side by side in a DataFrame
predictions_df = pd.DataFrame(
    data = dict(actual_categories = act_cats, predicted_categories = pre_cats)
)
predictions_df.head(10)

Unnamed: 0,actual_categories,predicted_categories
0,Fiction,Fiction
1,Fiction,NonFiction
2,Fiction,NonFiction
3,Fiction,NonFiction
4,Fiction,NonFiction
5,Fiction,NonFiction
6,Fiction,Fiction
7,Fiction,Fiction
8,Fiction,Fiction
9,Fiction,NonFiction


In [13]:
# Flag each row with 1 when the predicted category matches the actual one, else 0,
# then report the share of matching rows (the accuracy)
is_match = predictions_df["actual_categories"] == predictions_df["predicted_categories"]
predictions_df["corrected_prediction"] = is_match.astype(int)

n_correct = predictions_df["corrected_prediction"].sum()
n_correct / len(predictions_df)

np.float64(0.675)

In [14]:
# Predict the categories for the records which have no simplified category yet
missing_mask = books_dataset["simple_categories"].isna()
miss_cats = books_dataset.loc[missing_mask, ["isbn13", "description"]].reset_index(drop = True)

isbns, pred_cats = [], []
for idx in tqdm(range(len(miss_cats))):
    sequence = miss_cats.at[idx, "description"]
    pred_cats.append(generate_predictions(sequence = sequence, categories = books_categories))
    # Keep the ISBN alongside the prediction so it can be joined back later
    isbns.append(miss_cats.at[idx, "isbn13"])

100%|██████████| 1454/1454 [03:05<00:00,  7.83it/s]


In [15]:
# Collect the ISBNs and their predicted categories in a DataFrame
missing_prediction_df = pd.DataFrame(
    data = dict(isbn13 = isbns, predicted_categories = pred_cats)
)
missing_prediction_df.head(10)

Unnamed: 0,isbn13,predicted_categories
0,9780002261982,Fiction
1,9780006280897,NonFiction
2,9780006280934,NonFiction
3,9780006380832,NonFiction
4,9780006470229,Fiction
5,9780006472612,NonFiction
6,9780006483014,Fiction
7,9780006483892,Fiction
8,9780006483908,NonFiction
9,9780006486145,NonFiction


In [16]:
# Attach the predictions to the books by ISBN; books without a prediction get NaN
books_dataset = books_dataset.merge(missing_prediction_df, how = "left", on = "isbn13")

# Keep the existing simplified category and fall back to the prediction only where it is missing
books_dataset["simple_categories"] = books_dataset["simple_categories"].fillna(books_dataset["predicted_categories"])

# The helper column is no longer needed once its values have been copied over
books_dataset = books_dataset.drop(columns = "predicted_categories")

books_dataset.head(10)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883: A NOVEL THAT READERS and critic...,Fiction
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982: A new 'Christie for Christmas' ...,Fiction
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736: A memorable, mesmerizing heroin...",Fiction
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897: Lewis' work on the nature of lo...,NonFiction
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934: ""In The Problem of Pain, C.S. L...",NonFiction
5,9780006380832,0006380832,Empires of the Monsoon,Richard Hall,"Africa, East",http://books.google.com/books/content?id=MuPEQ...,Until Vasco da Gama discovered the sea-route t...,1998.0,4.41,608.0,65.0,Empires of the Monsoon: A History of the India...,9780006380832: Until Vasco da Gama discovered ...,NonFiction
6,9780006470229,000647022X,The Gap Into Madness,Stephen R. Donaldson,"Hyland, Morn (Fictitious character)",http://books.google.com/books/content?id=4oXav...,A new-cover reissue of the fourth book in the ...,1994.0,4.15,743.0,103.0,The Gap Into Madness: Chaos and Order,9780006470229: A new-cover reissue of the four...,Fiction
7,9780006472612,0006472613,Master of the Game,Sidney Sheldon,Adventure stories,http://books.google.com/books/content?id=TkTYp...,Kate Blackwell is an enigma and one of the mos...,1982.0,4.11,489.0,43540.0,Master of the Game,9780006472612: Kate Blackwell is an enigma and...,NonFiction
8,9780006482079,0006482074,Warhost of Vastmark,Janny Wurts,Fiction,http://books.google.com/books/content?id=uOL0f...,"Tricked once more by his wily half-brother, Ly...",1995.0,4.03,522.0,2966.0,Warhost of Vastmark,9780006482079: Tricked once more by his wily h...,Fiction
9,9780006483014,0006483011,The Once and Future King,Terence Hanbury White,Arthurian romances,http://books.google.com/books/content?id=Jx6Bv...,An omnibus volume of the author's complete sto...,1996.0,4.04,823.0,2805.0,The Once and Future King,9780006483014: An omnibus volume of the author...,Fiction


In [17]:
# Inspect the books whose raw category (case-insensitively) is one of these genre names
genre_names = [
    "romance", "science fiction", "scifi", "fantasy", "horror",
    "mystery", "thriller", "comedy", "crime", "historical",
]
is_genre = books_dataset["categories"].str.lower().isin(genre_names)
books_dataset[is_genre]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
24,9780006513087,0006513085,Gravity,Tess Gerritsen,Science fiction,http://books.google.com/books/content?id=KI66c...,Emma Watson a research physician has been trai...,2004.0,4.04,342.0,8024.0,Gravity,9780006513087: Emma Watson a research physicia...,NonFiction
475,9780099410355,0099410354,Traitor,Matthew Woodring Stover,Science fiction,http://books.google.com/books/content?id=VbICO...,"From the depths of catastrophe, a glimmer of h...",2002.0,4.0,320.0,6765.0,Traitor,"9780099410355: From the depths of catastrophe,...",NonFiction
478,9780099422341,0099422344,Yeats is Dead!,Joseph O'Connor,Comedy,http://books.google.com/books/content?id=DrE3I...,"In aid of Amnesty International, this is a bri...",2002.0,3.39,298.0,34.0,Yeats is Dead!: A Novel by Fifteen Irish Writers,9780099422341: In aid of Amnesty International...,Fiction
491,9780099446729,0099446723,Blackwood Farm,Anne Rice,Horror,http://books.google.com/books/content?id=cIn8T...,"Lestat Is Back, Saviour And Demon, Presiding O...",2003.0,3.86,774.0,26145.0,Blackwood Farm,"9780099446729: Lestat Is Back, Saviour And Dem...",Fiction
1090,9780261102422,0261102427,The Silmarillion,John Ronald Reuel Tolkien,Fantasy,http://books.google.com/books/content?id=22ePu...,Tolkien's Silmarillion is the core work of the...,1999.0,3.91,384.0,253.0,The Silmarillion,9780261102422: Tolkien's Silmarillion is the c...,Fiction
1435,9780340837955,0340837950,Stranger in a Strange Land,Robert A. Heinlein,Science fiction,http://books.google.com/books/content?id=ZQhiP...,"Epic, entertaining, Stranger in a Strange Land...",2005.0,3.92,672.0,563.0,Stranger in a Strange Land,"9780340837955: Epic, entertaining, Stranger in...",Fiction
1439,9780345251220,0345251229,Visions from Nowhere,William Arrow,Science fiction,,"The first novel in the series, ""Return to the ...",1976.0,3.23,183.0,10.0,Visions from Nowhere,"9780345251220: The first novel in the series, ...",Fiction
2845,9780575075597,0575075597,Replay,Ken Grimwood,Fantasy,http://books.google.com/books/content?id=9vmNP...,At forty-three Jeff Winston is tired of his lo...,2005.0,4.16,272.0,412.0,Replay,9780575075597: At forty-three Jeff Winston is ...,NonFiction
2860,9780590254762,0590254766,"The lion, the witch and the wardrobe",Clive Staples Lewis,Fantasy,,Four English school children enter the magic l...,1995.0,4.21,189.0,860.0,"The lion, the witch and the wardrobe",9780590254762: Four English school children en...,NonFiction
3288,9780739423851,0739423851,Wizard's Castle,Diana Wynne Jones,Fantasy,http://books.google.com/books/content?id=hB7hA...,Howl's moving castle - Eldest of three sisters...,2002.0,4.44,376.0,439.0,Wizard's Castle,9780739423851: Howl's moving castle - Eldest o...,Fiction


In [19]:
# Save the Categories Dataset in the same relative dataset folder the input was read
# from, instead of a user-specific absolute path that only exists on one machine
BOOKS_WITH_CATEGORIES_PATH = "./dataset/books_with_categories.csv"
books_dataset.to_csv(BOOKS_WITH_CATEGORIES_PATH, index = False)