In [2]:
import pandas as pd

# Read books_cleaned.csv into the `books` DataFrame that the rest of the notebook works on.
books = pd.read_csv("books_cleaned.csv")

In [4]:
# Tally how often each raw category label occurs, most frequent first.
category_counts = books["categories"].value_counts().reset_index()
category_counts

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
...,...,...
474,Conspiracies,1
475,Brothers and sisters,1
476,Rock musicians,1
477,Community life,1


In [6]:
# Narrow the tally down to the more important categories (more than 50 books each).
category_counts = books["categories"].value_counts().reset_index()
category_counts.query("count > 50")

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
5,Philosophy,117
6,Religion,117
7,Comics & Graphic Novels,116
8,Drama,86
9,Juvenile Nonfiction,57


In [8]:
# Collapse the hand-picked raw categories into a binary Fiction / Nonfiction label.
# Categories listed here but not in `fiction_like` are treated as Nonfiction.
fiction_like = {
    "Fiction",
    "Juvenile Fiction",
    "Comics & Graphic Novels",
    "Drama",
    "Poetry",
}

mapped_categories = [
    "Fiction",
    "Juvenile Fiction",
    "Biography & Autobiography",
    "History",
    "Literary Criticism",
    "Philosophy",
    "Religion",
    "Comics & Graphic Novels",
    "Drama",
    "Juvenile Nonfiction",
    "Science",
    "Poetry",
]

category_mapping = {
    category: ("Fiction" if category in fiction_like else "Nonfiction")
    for category in mapped_categories
}

In [10]:
# Translate each raw category through the mapping; categories that are not
# keys of `category_mapping` come out as NaN and are handled later.
revised = books["categories"].map(category_mapping)
books["revised_categories"] = revised

In [12]:
# Show the books that received a Fiction / Nonfiction label from the manual mapping.
has_label = books["revised_categories"].notna()
books[has_label]

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,tagged_description,revised_categories
0,9780002005883,0002005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,9780002005883 A NOVEL THAT READERS and critics...,Fiction
2,9780006178736,0006178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,"9780006178736 A memorable, mesmerizing heroine...",Fiction
8,9780006482079,0006482074,Warhost of Vastmark,,Janny Wurts,Fiction,http://books.google.com/books/content?id=uOL0f...,"Tricked once more by his wily half-brother, Ly...",1995.0,4.03,522.0,2966.0,9780006482079 Tricked once more by his wily ha...,Fiction
30,9780006646006,000664600X,Ocean Star Express,,Mark Haddon;Peter Sutton,Juvenile Fiction,http://books.google.com/books/content?id=I2QZA...,Joe and his parents are enjoying a summer holi...,2002.0,3.50,32.0,1.0,9780006646006 Joe and his parents are enjoying...,Fiction
46,9780007121014,0007121016,Taken at the Flood,,Agatha Christie,Fiction,http://books.google.com/books/content?id=3gWlx...,A Few Weeks After Marrying An Attractive Young...,2002.0,3.71,352.0,8852.0,9780007121014 A Few Weeks After Marrying An At...,Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,9781933648279,1933648279,Night Has a Thousand Eyes,,Cornell Woolrich,Fiction,http://books.google.com/books/content?id=3Gk6s...,"""Cornell Woolrich's novels define the essence ...",2007.0,3.77,344.0,680.0,"9781933648279 ""Cornell Woolrich's novels defin...",Fiction
5188,9784770028969,4770028962,Coin Locker Babies,,村上龍,Fiction,http://books.google.com/books/content?id=87DJw...,Rescued from the lockers in which they were le...,2002.0,3.75,393.0,5560.0,9784770028969 Rescued from the lockers in whic...,Fiction
5189,9788122200850,8122200850,"Cry, the Peacock",,Anita Desai,Fiction,http://books.google.com/books/content?id=_QKwV...,This book is the story of a young girl obsesse...,1980.0,3.22,218.0,134.0,9788122200850 This book is the story of a youn...,Fiction
5195,9788185300535,8185300534,I Am that,Talks with Sri Nisargadatta Maharaj,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,9788185300535 This collection of the timeless ...,Nonfiction


In [14]:
# using the bart-large-mnli transformer model
# Use a pipeline as a high-level helper
from transformers import pipeline

# Candidate labels handed to the zero-shot classifier.
book_categories = ["Fiction", "Nonfiction"]

# Zero-shot classification pipeline backed by the bart-large-mnli checkpoint.
zero_shot_model = "facebook/bart-large-mnli"
pipe = pipeline(task="zero-shot-classification", model=zero_shot_model)

Device set to use cpu


In [16]:
# Take the first description among the books labelled "Fiction" as a sample input.
fiction_mask = books["revised_categories"] == "Fiction"
fiction_descriptions = books.loc[fiction_mask, "description"].reset_index(drop=True)
sequence = fiction_descriptions[0]

In [None]:
# Classify the sample description against the two candidate labels and show the raw result.
pipe(sequence, candidate_labels=book_categories)

In [None]:
import numpy as np

# Run the classifier once and reuse the result: every call is a full model
# inference, so calling it separately for scores and labels doubles the cost.
prediction = pipe(sequence, book_categories)

# Position of the highest score, then the label stored at that same position.
max_index = np.argmax(prediction["scores"])
max_label = prediction["labels"][max_index]
max_label

In [None]:
def make_predictions(sequence, categories, classifier=None):
    """Return the highest-scoring category for one piece of text.

    Args:
        sequence: The text to classify (here, a book description).
        categories: Candidate labels passed to the classifier.
        classifier: Optional callable invoked as ``classifier(sequence, categories)``
            that returns a mapping with parallel ``"labels"`` and ``"scores"``
            entries. When omitted, the notebook-level ``pipe`` is used, which
            keeps existing two-argument calls working unchanged.

    Returns:
        The entry of ``"labels"`` whose matching ``"scores"`` value is largest.
    """
    if classifier is None:
        # Default to the pipeline created earlier in the notebook.
        classifier = pipe

    predictions = classifier(sequence, categories)
    # Pick by score position rather than assuming the labels arrive pre-sorted.
    max_index = int(np.argmax(predictions["scores"]))
    return predictions["labels"][max_index]

In [None]:
from tqdm import tqdm

# Ground-truth and predicted labels for the evaluation sample.
actual_categories = []
predicted_categories = []

# Filter once, outside the loop: the selection does not change between
# iterations, so recomputing it for every row only adds overhead.
fiction_descriptions = (
    books.loc[books["revised_categories"] == "Fiction", "description"]
    .reset_index(drop=True)
)

# Evaluate the first 300 "Fiction" descriptions.
# NOTE(review): assumes at least 300 such rows exist; fewer would raise a KeyError.
for i in tqdm(range(0, 300)):
    sequence = fiction_descriptions[i]
    predicted_categories.append(make_predictions(sequence, book_categories))
    actual_categories.append("Fiction")

In [None]:
# Filter once, outside the loop: the selection does not change between
# iterations, so recomputing it for every row only adds overhead.
nonfiction_descriptions = (
    books.loc[books["revised_categories"] == "Nonfiction", "description"]
    .reset_index(drop=True)
)

# Evaluate the first 300 "Nonfiction" descriptions, appending to the same
# lists as the "Fiction" run so both classes end up in one evaluation set.
# NOTE(review): assumes at least 300 such rows exist; fewer would raise a KeyError.
for i in tqdm(range(0, 300)):
    sequence = nonfiction_descriptions[i]
    predicted_categories.append(make_predictions(sequence, book_categories))
    actual_categories.append("Nonfiction")

In [None]:
# One row per evaluated description: the known label next to the model's guess.
predictions_df = pd.DataFrame(
    {
        "actual_categories": actual_categories,
        "predicted_categories": predicted_categories,
    }
)

In [None]:
# Preview the first five evaluation rows.
predictions_df.head(n=5)

In [None]:
# Mark each row with 1 when the predicted label equals the actual label, otherwise 0.
is_match = predictions_df["actual_categories"] == predictions_df["predicted_categories"]
predictions_df["correct_predictions"] = is_match.astype(int)

In [None]:
# Accuracy: the share of evaluated rows whose prediction was correct.
predictions_df["correct_predictions"].mean()

In [None]:
# Books the manual mapping left unlabeled; keep only the key and the text to classify.
unlabeled_mask = books["revised_categories"].isna()
missing_categories = books.loc[unlabeled_mask, ["isbn13", "description"]].reset_index(drop=True)

# Accumulators filled by the prediction loop.
isbns, predicted_categories_new = [], []

In [None]:
# Start from empty accumulators so re-running this cell cannot append a second
# copy of every prediction (duplicate ISBNs would multiply rows in a later merge).
isbns = []
predicted_categories_new = []

# Walk the ISBN and description columns in lockstep instead of doing two
# label lookups per row; `total` keeps the progress bar sized correctly.
for isbn, sequence in tqdm(
    zip(missing_categories["isbn13"], missing_categories["description"]),
    total=len(missing_categories),
):
    predicted_categories_new.append(make_predictions(sequence, book_categories))
    isbns.append(isbn)

In [None]:
# Pair each unlabeled book's ISBN with the category the classifier chose for it.
missing_predicted_df = pd.DataFrame(
    {
        "isbn13": isbns,
        "predicted_categories": predicted_categories_new,
    }
)

In [None]:
# Display the ISBN / predicted-category table for the previously unlabeled books.
missing_predicted_df

In [None]:
# merging the findings into the original books dataframe

# Attach the model's predictions by ISBN; books without a prediction get NaN there.
merged = books.merge(missing_predicted_df, on="isbn13", how="left")

# Keep the manually mapped label where one exists, otherwise use the prediction.
merged["revised_categories"] = merged["revised_categories"].fillna(
    merged["predicted_categories"]
)

# The helper column is no longer needed once the gaps are filled.
books = merged.drop(columns=["predicted_categories"])

In [None]:
# Final distribution of the binary label across the books.
books["revised_categories"].value_counts()

In [None]:
# saving the new dataframe to a csv to re-use

# Write the labelled dataset to disk without the positional index column.
output_path = "books_categories_classified.csv"
books.to_csv(output_path, index=False)